// Anime4K_Upscale_Denoise_CNN_x2_VL
// 移植自 https://github.com/bloc97/Anime4K/blob/78e4f78f65b772e94bae6e7db5c49af1e889f784/glsl/Upscale%2BDenoise/Anime4K_Upscale_Denoise_CNN_x2_VL.glsl

//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME Anime4K_Upscale_Denoise_2
//!USE MulAdd
//!CAPABILITY FP16

#include "../StubDefs.hlsli"


//!TEXTURE
Texture2D INPUT;

//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D conv2d_tf;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D conv2d_tf1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D conv2d_1_tf;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D conv2d_1_tf1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D conv2d_2_tf;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D conv2d_2_tf1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D conv2d_3_tf;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D conv2d_3_tf1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D conv2d_4_tf;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D conv2d_4_tf1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D conv2d_5_tf;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D conv2d_5_tf1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D conv2d_6_tf;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D conv2d_6_tf1;

//!SAMPLER
//!FILTER POINT
SamplerState sam;

//!SAMPLER
//!FILTER LINEAR
SamplerState sam1;


//!PASS 1
//!DESC Conv-4x3x3x3
//!IN INPUT
//!OUT conv2d_tf, conv2d_tf1
//!BLOCK_SIZE 16
//!NUM_THREADS 64

void Pass1(uint2 blockStart, uint3 threadId) {
	uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	
	uint i, j;

	MF3 src[4][4];
	[unroll]
	for (i = 0; i <= 2; i += 2) {
		[unroll]
		for (j = 0; j <= 2; j += 2) {
			float2 tpos = (gxy + uint2(i, j)) * inputPt;
			const MF4 sr = INPUT.GatherRed(sam, tpos);
			const MF4 sg = INPUT.GatherGreen(sam, tpos);
			const MF4 sb = INPUT.GatherBlue(sam, tpos);

			// w z
			// x y
			src[i][j] = MF3(sr.w, sg.w, sb.w);
			src[i][j + 1] = MF3(sr.x, sg.x, sb.x);
			src[i + 1][j] = MF3(sr.z, sg.z, sb.z);
			src[i + 1][j + 1] = MF3(sr.y, sg.y, sb.y);
		}
	}

	[unroll]
	for (i = 1; i <= 2; ++i) {
		[unroll]
		for (j = 1; j <= 2; ++j) {
			uint2 destPos = gxy + uint2(i - 1, j - 1);

			if (i != 1 || j != 1) {
				if (destPos.x >= inputSize.x || destPos.y >= inputSize.y) {
					continue;
				}
			}

			MF4 target1 = { 0.007907974, -0.035503313, 0.057224784, -0.19763541 };
			target1 = MulAdd(src[i - 1][j - 1], MF3x4(0.28296316, -0.020139743, 0.1038232, 0.09352482, -0.16964972, 0.07910997, -0.049914766, -0.10661066, -0.121037185, -0.029087039, -0.02511847, -0.078911744), target1);
			target1 = MulAdd(src[i - 1][j], MF3x4(-0.3927183, 0.01805193, -0.031168332, -0.13300525, 0.20814548, 0.118818566, 0.1655351, 0.095023684, 0.17600809, -0.03928444, -0.014350658, 0.08458312), target1);
			target1 = MulAdd(src[i - 1][j + 1], MF3x4(0.079089314, -0.0421829, 0.05452305, -0.22055493, 0.013279097, -0.12875281, 0.02452735, -0.101503745, -0.085946664, 0.05539176, 0.022408713, 0.14837204), target1);
			target1 = MulAdd(src[i][j - 1], MF3x4(-0.102643915, -0.011254746, 0.1478563, 0.1030208, 0.12396588, 0.0016621432, 0.2551224, -0.10399001, -0.01068436, 0.07155532, -0.104522154, 0.026937222), target1);
			target1 = MulAdd(src[i][j], MF3x4(-0.8789423, 0.35707328, -0.29964274, -0.064913996, 0.4962815, 0.26001287, -0.9511284, 0.49574667, 0.39539725, 0.16308042, 0.119878456, -0.30259115), target1);
			target1 = MulAdd(src[i][j + 1], MF3x4(-0.08852938, -0.32612664, -0.006712046, 0.28693515, 0.06320871, -0.3322611, 0.04651086, -0.11020996, 0.01821082, -0.22851005, -0.07803438, 0.021527015), target1);
			target1 = MulAdd(src[i + 1][j - 1], MF3x4(0.12295851, -0.011285535, 0.015859747, 0.04005441, -0.018136669, 0.03171969, -0.0406123, -0.10731229, -0.12117574, 0.005033036, 0.047838476, 0.026843475), target1);
			target1 = MulAdd(src[i + 1][j], MF3x4(0.4655988, 0.05519082, 0.039515793, 0.28410903, -0.36144528, 0.13039446, 0.11338478, -0.2141387, -0.10026682, -0.07903024, -0.09410254, 0.043833878), target1);
			target1 = MulAdd(src[i + 1][j + 1], MF3x4(0.110124744, -0.024725702, 0.028102143, -0.09493807, -0.06455328, -0.15164614, 0.04425987, 0.15483347, -0.045039337, 0.07210396, -0.005390788, -0.03832707), target1);

			MF4 target2 = { 0.019286277, -0.033644073, 0.08196311, 0.0054393094 };
			target2 = MulAdd(src[i - 1][j - 1], MF3x4(-0.012326053, 0.050769784, 0.1278702, -0.100782245, 0.14329414, -0.054558773, 0.023473471, 0.056829426, 0.048292916, 0.0046510273, -0.11478287, 0.0011030561), target2);
			target2 = MulAdd(src[i - 1][j], MF3x4(0.29542983, -0.55061895, -0.068554066, 0.1433222, -0.072878316, 0.30201668, -0.2223378, -0.06704077, 0.16955832, 0.3279914, 0.17619601, -0.1276919), target2);
			target2 = MulAdd(src[i - 1][j + 1], MF3x4(0.09623417, 0.30559412, 0.094622105, -0.076706685, 0.07943858, -0.084815115, 0.12472551, 0.079850115, -0.13044213, -0.21300878, -0.095747225, 0.13412355), target2);
			target2 = MulAdd(src[i][j - 1], MF3x4(0.21291664, 0.17195296, -0.20080926, 0.1064855, 0.10228669, -0.09580175, -0.11217631, -0.09740562, -0.0033135475, -0.053094357, 0.2983595, 0.035281878), target2);
			target2 = MulAdd(src[i][j], MF3x4(-0.08955812, -0.45707774, -0.4606922, -0.5754473, -0.11395895, 0.33530128, 0.29705846, -0.18877256, -0.43502945, 0.114171304, -0.3750776, -0.081597246), target2);
			target2 = MulAdd(src[i][j + 1], MF3x4(-0.26109028, 0.02662961, -0.10441071, 0.11199392, -0.12038989, -0.09642296, -0.061320662, -0.33058178, 0.20212512, 0.00840794, 0.14357455, -0.038080238), target2);
			target2 = MulAdd(src[i + 1][j - 1], MF3x4(-0.09533881, -0.13644339, 0.068756215, 0.079305276, -0.053370547, 0.19572955, 0.0682981, 0.14469264, 0.15582883, -0.057183057, -0.13919263, -0.016394936), target2);
			target2 = MulAdd(src[i + 1][j], MF3x4(-0.041189935, 0.39878023, 0.028704925, 0.30194348, -0.04486593, -0.33899093, -0.103968106, 0.21802065, -0.077099144, -0.07389541, 0.18069103, 0.18894517), target2);
			target2 = MulAdd(src[i + 1][j + 1], MF3x4(-0.12399862, 0.19246885, 0.034825478, -0.0044787163, 0.13121822, -0.13573012, -0.030162754, 0.1899518, 0.102326415, -0.061512686, -0.005647928, -0.0937634), target2);

			conv2d_tf[destPos] = target1;
			conv2d_tf1[destPos] = target2;
		}
	}
}

//!PASS 2
//!DESC Conv-4x3x3x16
//!IN conv2d_tf, conv2d_tf1
//!OUT conv2d_1_tf, conv2d_1_tf1
//!BLOCK_SIZE 8
//!NUM_THREADS 64

void Pass2(uint2 blockStart, uint3 threadId) {
	uint2 gxy = Rmp8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// [ a, d, g ]
	// [ b, e, h ]
	// [ c, f, i ]
	MF4 a1 = conv2d_tf.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	MF4 b1 = conv2d_tf.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	MF4 c1 = conv2d_tf.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	MF4 d1 = conv2d_tf.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	MF4 e1 = conv2d_tf.SampleLevel(sam, pos, 0);
	MF4 f1 = conv2d_tf.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	MF4 g1 = conv2d_tf.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	MF4 h1 = conv2d_tf.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	MF4 i1 = conv2d_tf.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	MF4 na1 = max(-a1, 0);
	MF4 nb1 = max(-b1, 0);
	MF4 nc1 = max(-c1, 0);
	MF4 nd1 = max(-d1, 0);
	MF4 ne1 = max(-e1, 0);
	MF4 nf1 = max(-f1, 0);
	MF4 ng1 = max(-g1, 0);
	MF4 nh1 = max(-h1, 0);
	MF4 ni1 = max(-i1, 0);

	a1 = max(a1, 0);
	b1 = max(b1, 0);
	c1 = max(c1, 0);
	d1 = max(d1, 0);
	e1 = max(e1, 0);
	f1 = max(f1, 0);
	g1 = max(g1, 0);
	h1 = max(h1, 0);
	i1 = max(i1, 0);

	MF4 a2 = conv2d_tf1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	MF4 b2 = conv2d_tf1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	MF4 c2 = conv2d_tf1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	MF4 d2 = conv2d_tf1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	MF4 e2 = conv2d_tf1.SampleLevel(sam, pos, 0);
	MF4 f2 = conv2d_tf1.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	MF4 g2 = conv2d_tf1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	MF4 h2 = conv2d_tf1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	MF4 i2 = conv2d_tf1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	MF4 na2 = max(-a2, 0);
	MF4 nb2 = max(-b2, 0);
	MF4 nc2 = max(-c2, 0);
	MF4 nd2 = max(-d2, 0);
	MF4 ne2 = max(-e2, 0);
	MF4 nf2 = max(-f2, 0);
	MF4 ng2 = max(-g2, 0);
	MF4 nh2 = max(-h2, 0);
	MF4 ni2 = max(-i2, 0);

	a2 = max(a2, 0);
	b2 = max(b2, 0);
	c2 = max(c2, 0);
	d2 = max(d2, 0);
	e2 = max(e2, 0);
	f2 = max(f2, 0);
	g2 = max(g2, 0);
	h2 = max(h2, 0);
	i2 = max(i2, 0);

	MF4 target1 = { -0.044073943, 0.12072677, -0.0022342638, -0.24414532 };
	target1 = MulAdd(a1, MF4x4(-0.04088509, -0.06585775, -0.3094732, 0.12059048, 0.041417453, -0.06144871, -0.06655134, 0.03308842, 0.09287731, 0.010969216, 0.10343026, -0.11185897, 0.05685865, -0.09490512, 0.040908635, 0.03501189), target1);
	target1 = MulAdd(b1, MF4x4(-0.04854754, -0.098667145, 0.67147833, -0.11299351, -0.022114437, -0.029202767, 0.014179382, 0.26027945, 0.22076549, -0.16490546, -0.0010764733, 0.08405975, 0.11849154, -0.19072372, -0.35719597, -0.059621073), target1);
	target1 = MulAdd(c1, MF4x4(0.079224996, 0.0669873, -0.1718969, -0.05002573, 0.044926763, -0.02904369, 0.017489236, 0.01144465, 0.059109706, 0.064998455, 0.14725484, -0.23879208, 0.039234288, -0.027365638, 0.26172164, -0.094598554), target1);
	target1 = MulAdd(d1, MF4x4(-0.07159218, -0.03181544, 0.113837324, -0.053089984, -0.098298974, -0.29500297, 0.1608509, -0.044355504, 0.050882854, -0.19417204, -0.080487266, -0.00879743, 0.0007914453, 0.16640955, 0.21786706, 0.16398212), target1);
	target1 = MulAdd(e1, MF4x4(0.16324541, 0.19753313, -0.46597233, -0.2593132, -0.2781038, -0.21973547, -0.024623038, -0.16348499, 0.3628299, 0.044888914, 0.04054647, -0.63605887, 0.02099492, 0.060544077, -0.49359834, 0.36336297), target1);
	target1 = MulAdd(f1, MF4x4(-0.16885692, 0.31907207, 0.020906802, 0.13290039, -0.037330728, -0.022859452, -0.020451576, -0.113437995, -0.085683934, 0.054102756, -0.044824492, 0.061346747, -0.038684413, 0.098444365, -0.06734984, -0.17084897), target1);
	target1 = MulAdd(g1, MF4x4(-0.015821548, -0.119599186, 0.1614827, 0.08383641, -0.07933593, 0.12528986, -0.06300182, 0.09286327, -0.10199266, 0.02419403, 0.0028411683, -0.09028338, 0.07962534, -0.08676035, -0.19237737, -0.115502626), target1);
	target1 = MulAdd(h1, MF4x4(0.09471972, 0.21153462, -0.14393018, 0.055180002, 0.1817461, 0.016607309, -0.0771979, 0.11181317, -0.5491086, -0.102757886, -0.20952754, 0.022466583, -0.075119644, -0.14725658, 0.38451517, 0.12920731), target1);
	target1 = MulAdd(i1, MF4x4(0.0867803, 0.114654355, 0.21199988, -0.15367955, -0.01803536, 0.056378633, 0.0018388306, 0.024613786, -0.13306658, 0.017211098, 0.073351346, -0.12064064, -0.10484361, -0.067748636, 0.033206712, -0.13061953), target1);
	target1 = MulAdd(a2, MF4x4(-0.002236411, -0.022144757, -0.04586377, 0.101181075, -0.03511624, 0.08440529, 0.18544284, -0.22786349, -0.042184375, 0.015734851, -0.038622506, 0.038529944, -0.09170703, 0.034527462, -0.07817406, 0.10547265), target1);
	target1 = MulAdd(b2, MF4x4(-0.12135524, -0.07412039, -0.04979351, -0.082267545, 0.13343571, 0.29196215, -0.4364121, -0.10226428, 0.060835477, -0.23307934, -0.018231759, 0.15550235, 0.09095689, 0.18164484, 0.1322021, -0.022567045), target1);
	target1 = MulAdd(c2, MF4x4(-0.0054531163, -0.039762255, -0.030490747, 0.04779882, -0.15290286, 0.056712102, -0.0776974, 0.04114215, 0.15946816, -0.03882117, 0.16770308, -0.026126247, -0.027203865, -0.064107865, -0.13670811, 0.1556276), target1);
	target1 = MulAdd(d2, MF4x4(-0.092548385, -0.027285473, 0.084179096, 0.014961629, 0.2564254, 0.07626849, 0.28534448, 0.2588713, -0.018600503, -0.2433456, 0.041392803, -0.045712482, 0.26388907, -0.053502295, 0.14522223, 0.032808404), target1);
	target1 = MulAdd(e2, MF4x4(-0.0013780193, 0.3482449, 0.071003586, -0.30707207, -0.05122194, -0.2833618, 0.07910779, 0.051078696, 0.021535402, 0.13021478, 0.022049015, -0.533547, 0.57265025, -0.12843914, -0.14913581, -0.1433724), target1);
	target1 = MulAdd(f2, MF4x4(0.07382619, -0.12152924, 0.13364957, 0.181974, 0.15804219, -0.10126773, 0.3029618, -0.12874149, 0.13743348, -0.23245592, -0.20119278, 0.029547188, 0.042436857, 0.04213892, -0.07975374, 0.023821082), target1);
	target1 = MulAdd(g2, MF4x4(0.022782229, -0.08359311, -0.060623147, 0.06565042, 0.09828792, 0.044808697, -0.28872305, -0.00092168007, 0.021737702, -0.08698349, 0.1950025, 0.07931995, 0.040952396, -0.07443172, -0.021157127, 0.0056698937), target1);
	target1 = MulAdd(h2, MF4x4(-0.09995892, -0.2047294, 0.1414849, 0.062335726, -0.22492298, 0.05269799, -0.029233055, -0.050517935, -0.12534393, -0.12194023, -0.07035469, -0.070764475, 0.18903446, 0.07691209, 0.06153371, 0.011280912), target1);
	target1 = MulAdd(i2, MF4x4(-0.036189888, -0.07586571, -0.05888163, 0.010425367, -0.028375402, -0.18870986, -0.19146784, 0.19274063, -0.18856238, 0.0064240266, -0.14537223, -0.06971656, 0.0852742, -0.04866623, -0.031686075, 0.031702038), target1);
	target1 = MulAdd(na1, MF4x4(0.0618941, 0.100858234, 0.2628019, -0.048507668, -0.051001363, -0.03195978, 0.035452217, -0.001991919, -0.09649028, -0.047445696, -0.09221298, 0.07602656, -0.02382384, -0.119645916, 0.085616075, -0.07076033), target1);
	target1 = MulAdd(nb1, MF4x4(0.019222878, -0.0491929, -0.4902266, 0.18501294, 0.014529614, -0.077125326, 0.011563931, -0.20236616, -0.101982154, -0.021150962, -0.07537948, -0.1540349, 0.028949164, -0.06827332, 0.0067634755, 0.09582376), target1);
	target1 = MulAdd(nc1, MF4x4(-0.05995539, -0.031138182, 0.01334257, 0.06827176, -0.030762246, 0.006615233, -0.03562788, 0.016249394, -0.14797118, 0.014671043, -0.09325859, 0.25653747, -0.11474991, 0.05436232, 0.031051394, 0.04179694), target1);
	target1 = MulAdd(nd1, MF4x4(0.032279838, -0.030521005, 0.0029688699, 0.005165139, 0.15907808, -0.20421815, -0.07713175, 0.067530625, -0.08619395, 0.026114263, 0.08821273, 0.011591694, 0.018677557, 0.057708874, -0.25859246, -0.18693781), target1);
	target1 = MulAdd(ne1, MF4x4(0.10823143, -0.31875235, -0.24394153, -0.0025489891, 0.016761065, -0.19857498, -0.07858479, -0.07811158, -0.38551694, -0.049090322, -0.050053325, 0.23398961, 0.014974165, 0.17498055, 0.29105362, -0.353647), target1);
	target1 = MulAdd(nf1, MF4x4(0.05621677, -0.19492444, 0.460332, 0.055917628, -0.06404381, -0.06684098, 0.053624872, 0.057300456, -0.019248677, -0.15110065, 0.032379635, -0.12673225, 0.0068658157, -0.13001235, -0.017716292, 0.064182095), target1);
	target1 = MulAdd(ng1, MF4x4(-0.06764552, 0.004707433, -0.13827331, -0.21957871, -0.03789028, -0.04962028, 0.022955444, -0.058468018, 0.13735814, -0.031270552, -0.018490225, 0.0063876202, -0.052979283, -0.030049473, -0.004811771, -0.0044099926), target1);
	target1 = MulAdd(nh1, MF4x4(-0.028652798, -0.027029367, 0.62600744, 0.0900562, 0.03869923, -0.20111556, 0.095930666, -0.13164565, 0.5562579, 0.011937122, 0.22882107, 0.030288015, 0.09856272, 0.04736032, -0.077492185, -0.10207275), target1);
	target1 = MulAdd(ni1, MF4x4(-0.10581002, -0.16504957, -0.5688921, 0.0414545, 0.04749444, -0.052849945, -0.011017121, -0.025284614, 0.14316759, -0.08547362, -0.09654446, 0.08682504, 0.050776027, 0.0678741, -0.04913651, 0.07527552), target1);
	target1 = MulAdd(na2, MF4x4(0.04126091, 0.0048704315, 0.041699376, -0.05820725, -0.09664279, 0.07648305, -0.17979898, 0.11698985, -0.025436765, 0.023232851, 0.010656572, 0.08157569, 0.19584864, -0.022928072, 0.053339157, 0.0039929505), target1);
	target1 = MulAdd(nb2, MF4x4(0.040733483, 0.12260473, 0.08071146, 0.07257762, -0.016945919, -0.31637576, -0.24281953, -0.0038469466, -0.10203634, 0.13631973, 0.06505259, -0.13119389, -0.09723076, -0.139551, -0.07504509, 0.08645985), target1);
	target1 = MulAdd(nc2, MF4x4(0.017005404, 0.049066268, -0.007544932, -0.04884536, 0.09984347, -0.04447364, 0.4902235, -0.062780835, -0.18389583, 0.07305648, -0.22014385, 0.08004685, 0.0992568, -0.08569604, 0.093966395, -0.07047139), target1);
	target1 = MulAdd(nd2, MF4x4(0.0017705248, 0.020553982, -0.09167042, 0.0036356782, -0.11867446, -0.07055574, 0.40252638, 0.09657129, 0.0888632, 0.1031708, -0.022127641, -0.023769693, -0.0861388, 0.13420185, -0.11774454, 0.038774434), target1);
	target1 = MulAdd(ne2, MF4x4(-0.15173717, -0.13590458, -0.0891863, 0.12289548, 0.13942605, 0.22152588, -0.19292432, 0.14169839, 0.010543665, 0.07648361, -0.057333756, 0.09535759, -0.053601623, -0.026824495, 0.09365424, 0.17476946), target1);
	target1 = MulAdd(nf2, MF4x4(-0.070416056, -0.061970036, -0.039723337, -0.18874651, -0.07098426, -0.019835865, -0.5612458, 0.060437083, -0.03774378, 0.18536821, 0.28587544, 0.035555754, 0.15771326, -0.13527197, 0.13342534, -0.06564073), target1);
	target1 = MulAdd(ng2, MF4x4(-0.10967661, 0.025388904, 0.09003177, -0.04087592, 0.09531671, -0.11809294, -0.41613623, 0.038198076, 0.01019813, -0.018864965, -0.18400626, -0.038704176, 0.0105671035, 0.024449013, -0.008989595, -0.027171193), target1);
	target1 = MulAdd(nh2, MF4x4(0.16193569, -0.21445285, -0.20130903, -0.13498883, -0.008031679, 0.050757203, 0.78938776, -0.03749514, 0.11998137, 0.19368882, 0.12328945, 0.0058578993, -0.13852382, -0.033867255, -0.018267661, 0.036348555), target1);
	target1 = MulAdd(ni2, MF4x4(-0.06254118, 0.087295115, 0.031116437, 0.0416281, 0.061828617, 0.34479564, -0.15537797, -0.17144552, 0.13989387, -0.13792284, 0.056215156, 0.12714528, -0.0198865, 0.04927947, 0.013614583, -0.041810013), target1);
	
	MF4 target2 = { -0.07235962, -0.019149294, 0.05072898, 0.03962245 };
	target2 = MulAdd(a1, MF4x4(0.07115729, 0.01065505, 0.19167988, -0.02504489, -0.15064801, 0.079008736, 0.05437936, 0.027479589, -0.021383656, 0.032731537, -0.06657876, 0.022649521, -0.06501893, -0.02335689, 0.010445489, -0.05430297), target2);
	target2 = MulAdd(b1, MF4x4(-0.1178601, 0.07425715, 0.063272275, -0.18308601, -0.13955134, 0.005166404, -0.022591779, -0.016827974, -0.024990188, -0.13372071, -0.056342285, 0.12489847, 0.081861794, -0.07083351, 0.021897513, 0.0629395), target2);
	target2 = MulAdd(c1, MF4x4(0.051357627, -0.13874975, -0.09887168, -0.011908862, 0.03639772, -0.13195883, -0.05321156, 0.03913229, -0.08160194, -0.07128151, 0.043625016, 0.11966009, 0.03162217, 0.018834392, -0.0625129, 0.10726711), target2);
	target2 = MulAdd(d1, MF4x4(-0.15922394, -0.043482754, -0.22571066, 0.009280428, -0.3882705, 0.08418719, 0.15329506, -0.028419001, -0.011272379, 0.15897545, 0.041217074, -0.0143014155, 0.09451862, -0.056342427, -0.14568482, 0.05556279), target2);
	target2 = MulAdd(e1, MF4x4(0.13879324, -0.23339099, -0.24573983, -0.09575104, 0.03823306, 0.4752516, -0.1696623, -0.18472373, -0.1510259, 0.23040254, 0.4196143, 0.3462817, 0.035172507, 0.18228662, 0.22475636, -0.19945027), target2);
	target2 = MulAdd(f1, MF4x4(-0.08876766, 0.19567333, 0.25174314, -0.09637879, -0.007957943, 0.13510521, 0.030193076, -0.0018362573, -0.006884444, -0.41804117, -0.1026309, -0.053339038, -0.1283198, -0.03033918, 0.055674326, 0.094377995), target2);
	target2 = MulAdd(g1, MF4x4(0.06780768, -0.07774435, -0.0616546, -0.046531744, -0.11723141, 0.10792474, 0.013314576, -0.031451598, -0.009870351, 0.10215877, -0.13101454, -0.19878799, -0.09712651, 0.10423937, 0.14170039, -0.03359521), target2);
	target2 = MulAdd(h1, MF4x4(-0.020114673, -0.015194169, 0.03657608, 0.17162928, 0.070458665, -0.08041664, 0.14067306, 0.19699603, -0.28763783, -0.033556152, -0.6588468, -0.48221052, -0.123711474, -0.080758795, -0.3187303, 0.121004865), target2);
	target2 = MulAdd(i1, MF4x4(-0.074900605, 0.09297913, -0.08621144, 0.116730206, -0.034766622, -0.10381484, 0.060793545, -0.014790814, -0.123858415, -0.0010626495, 0.20547503, -0.07206306, -0.17324795, 0.023932874, 0.017495958, -0.09924652), target2);
	target2 = MulAdd(a2, MF4x4(-0.015568068, 0.005394868, 0.15463537, 0.06416607, -0.045670815, -0.013540727, -0.12960619, 0.0006581649, 0.09432853, 0.05575682, -0.022219105, 0.022416297, 0.0148129435, -0.067619696, 0.022989385, -0.09695771), target2);
	target2 = MulAdd(b2, MF4x4(-0.107209, 0.07072438, -0.10235772, -0.12078849, -0.02751833, -0.043195058, -0.17197154, 0.120612316, -0.17310137, -0.09429793, 0.06511165, 0.18072544, -0.21168593, 0.16383737, 0.25012484, -0.089589044), target2);
	target2 = MulAdd(c2, MF4x4(0.005439779, 0.0028433986, -0.09885586, -0.06572956, -0.0061691296, 0.15485546, -0.23724958, 0.004232802, 0.07794742, -0.012552598, 0.07554404, 0.10843201, -0.013223918, -0.08705092, -0.23228747, 0.03599732), target2);
	target2 = MulAdd(d2, MF4x4(-0.043396916, -0.10680695, -0.019935586, -0.06703658, -0.30075943, -0.010179525, 0.30197874, 0.04888297, 0.00779067, 0.22583807, 0.2039884, -0.0074303118, -0.19240093, -0.024718538, 0.057117213, 0.19431825), target2);
	target2 = MulAdd(e2, MF4x4(-0.37633005, 0.043971814, -0.21423087, 0.118503235, -0.15058799, 0.115756795, -0.13719647, 0.020510519, 0.1123193, 0.14797291, 0.05467349, 0.2039607, -0.31973588, 0.1667847, -0.017739004, -0.11280262), target2);
	target2 = MulAdd(f2, MF4x4(-0.0084394775, -0.1281101, -0.20841378, 0.01986435, -0.04122467, -0.21089631, -0.08062371, 0.11315133, 0.05693114, -0.23773515, 0.03792205, -0.008872407, 0.04554895, -0.10683658, 0.10683206, 0.06875721), target2);
	target2 = MulAdd(g2, MF4x4(-0.103948504, -0.007483217, -0.12571928, 0.054868475, -0.030646881, -0.010098222, 0.019018777, -0.07072212, -0.10689893, 0.16498323, 0.048089568, -0.10912806, -0.027318537, -0.025491163, 0.012588013, 0.072701246), target2);
	target2 = MulAdd(h2, MF4x4(0.14094622, -0.028118243, 0.016804086, -0.18000692, 0.33351874, 0.14980756, -0.07135749, -0.16573106, -0.17243773, 0.054617904, -0.2933543, -0.12602285, 0.08480712, -0.05704333, 0.22336398, 0.026583148), target2);
	target2 = MulAdd(i2, MF4x4(0.046759557, -0.03100408, 0.40000245, -0.08521555, 0.19592628, -0.15150753, 0.25288078, -0.061794683, -0.047818147, -0.12249124, 0.020410215, -0.11503924, 0.046108168, 0.030459814, -0.14096366, 0.09120256), target2);
	target2 = MulAdd(na1, MF4x4(-0.087491795, -0.024289595, -0.09060237, 0.020922959, 0.09557061, -0.08556962, -0.0503455, -0.010846053, 0.0030694185, -0.008256591, 0.08290225, -0.034981687, 0.07342003, -0.021816112, -0.13905519, -0.06265962), target2);
	target2 = MulAdd(nb1, MF4x4(-0.08126147, -0.05866924, -0.015698025, 0.093630895, -0.02379264, 0.115918085, 0.19431724, 0.041815966, -0.051647816, 0.15277039, -0.03721037, -0.085520886, 0.041766718, 0.104392216, 0.0559556, 0.0049254233), target2);
	target2 = MulAdd(nc1, MF4x4(-0.11176419, 0.112272635, 0.1367475, -0.010482275, -0.06719008, 0.064003386, -0.08132314, 0.015465676, 0.052741583, 0.06779717, 0.038533892, -0.16428822, 0.040990274, 0.002559234, 0.097567044, -0.058192518), target2);
	target2 = MulAdd(nd1, MF4x4(0.17228632, 0.008296625, 0.009418271, 0.037103783, -0.0601486, 0.04531715, 0.19613501, 0.112170085, -0.02256726, -0.093685195, -0.1341531, -0.038480807, 0.109840475, 0.062418167, 0.15140085, 0.050787117), target2);
	target2 = MulAdd(ne1, MF4x4(0.15433665, 0.2104034, 0.12395812, 0.13799714, 0.14945604, 0.67457545, 0.27575177, -0.047493283, 0.24992993, -0.5305435, 0.0131732905, -0.36911693, 0.14442082, -0.18583177, -0.2861722, 0.19419897), target2);
	target2 = MulAdd(nf1, MF4x4(0.040242445, -0.13234852, 0.10056324, 0.055854917, 0.07447713, -0.023067042, 0.00021051937, -0.0495407, -0.22037992, 0.68047297, 0.05774606, -0.012461005, 0.104557075, 0.04832623, 0.010292581, -0.050617047), target2);
	target2 = MulAdd(ng1, MF4x4(-0.060079176, 0.086553656, 0.0060872175, -0.012576339, 0.025149338, -0.07379716, -0.18048704, -0.007130346, 0.007826557, -0.095655076, -0.0032888134, 0.21027069, -0.09868755, -0.1180311, 0.0081250835, -0.05078016), target2);
	target2 = MulAdd(nh1, MF4x4(0.19124818, -0.05949092, -0.36762074, -0.08203597, -0.10276991, 0.111005515, -0.2845309, 0.113985784, 0.07206471, -0.026585411, 0.20032002, 0.5691625, -0.0460136, 0.03874166, 0.09858682, -0.15913802), target2);
	target2 = MulAdd(ni1, MF4x4(-0.00397842, -0.014763085, 0.080231026, -0.09142265, 0.03637215, 0.064106315, -0.030963007, 0.0557953, 0.04173885, -0.024534896, -0.2092259, 0.06913638, 0.08103145, -0.0033994897, -0.10903093, 0.062850125), target2);
	target2 = MulAdd(na2, MF4x4(0.01206918, 0.024855271, -0.051995132, 0.013999821, -0.021517826, 0.06216198, -0.050853133, -0.064136736, -0.047408275, -0.07858566, 0.074464396, -0.038218755, -0.13216262, 0.008905726, 0.10333, 0.03049554), target2);
	target2 = MulAdd(nb2, MF4x4(-0.027152343, -0.069046065, -0.013017797, 0.0763, -0.08611993, -0.020867927, 0.012807627, -0.11971997, 0.025972975, 0.095127404, -0.070044935, -0.21399231, -0.22536097, -0.028828809, 0.123399965, -0.15967365), target2);
	target2 = MulAdd(nc2, MF4x4(0.038314234, -0.014114242, 0.012115026, 0.05505015, 0.11785298, -0.08772618, 0.034408223, 0.09134674, -0.04727011, 0.020709611, -0.01780165, -0.14374214, -0.30412516, -0.011123043, -0.024216317, -0.007538433), target2);
	target2 = MulAdd(nd2, MF4x4(-0.17673545, 0.077738725, 0.056153737, 0.028693894, 0.05688375, 0.021928595, 0.014585902, 0.019364892, 0.029056642, -0.2072201, -0.17548367, 0.085471265, 0.16439342, -0.0052957633, 0.22321554, -0.19246858), target2);
	target2 = MulAdd(ne2, MF4x4(0.1914782, -0.15620962, -0.16686897, -0.04141303, 0.07696967, -0.013115313, -0.057627093, -0.13849305, 0.08699377, -0.07339016, -0.053074118, -0.059418138, 0.19988623, -0.23852244, -0.12574267, -0.29139704), target2);
	target2 = MulAdd(nf2, MF4x4(-0.017691063, 0.18901291, 0.16250716, -0.11039392, 0.056900974, 0.036662772, -0.13399602, -0.11378214, -0.10924602, 0.2130181, -0.042094063, -0.012445028, 0.013168919, 0.119448364, -0.014406005, 0.0054324497), target2);
	target2 = MulAdd(ng2, MF4x4(0.11552786, 0.090796515, -0.11559005, -0.035706047, -0.044022456, -0.027642358, 0.08824298, 0.035067793, 0.18125483, -0.15502097, 0.094219126, 0.07493505, 0.022493582, 0.038250685, -0.076567575, -0.059311453), target2);
	target2 = MulAdd(nh2, MF4x4(-0.08612596, 0.016376335, -0.0023271537, 0.32511148, 0.03789289, 0.13106889, 0.22370385, 0.21145949, 0.1844514, -0.0766592, 0.093758754, 0.13821359, -0.062405586, 0.0028724174, -0.13588348, 0.00024406122), target2);
	target2 = MulAdd(ni2, MF4x4(-0.08991004, 0.074423954, -0.020964831, -0.070288494, -0.1192369, -0.015506713, -0.28136373, 0.042911243, 0.08215164, 0.11065419, -0.006201638, 0.057742044, 0.0014476188, -0.01443158, 0.22631277, -0.06787264), target2);

	conv2d_1_tf[gxy] = target1;
	conv2d_1_tf1[gxy] = target2;
}

//!PASS 3
//!DESC Conv-4x3x3x16
//!IN conv2d_1_tf, conv2d_1_tf1
//!OUT conv2d_2_tf, conv2d_2_tf1
//!BLOCK_SIZE 8
//!NUM_THREADS 64

void Pass3(uint2 blockStart, uint3 threadId) {
	uint2 gxy = Rmp8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// [ a, d, g ]
	// [ b, e, h ]
	// [ c, f, i ]
	MF4 a1 = conv2d_1_tf.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	MF4 b1 = conv2d_1_tf.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	MF4 c1 = conv2d_1_tf.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	MF4 d1 = conv2d_1_tf.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	MF4 e1 = conv2d_1_tf.SampleLevel(sam, pos, 0);
	MF4 f1 = conv2d_1_tf.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	MF4 g1 = conv2d_1_tf.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	MF4 h1 = conv2d_1_tf.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	MF4 i1 = conv2d_1_tf.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	MF4 na1 = max(-a1, 0);
	MF4 nb1 = max(-b1, 0);
	MF4 nc1 = max(-c1, 0);
	MF4 nd1 = max(-d1, 0);
	MF4 ne1 = max(-e1, 0);
	MF4 nf1 = max(-f1, 0);
	MF4 ng1 = max(-g1, 0);
	MF4 nh1 = max(-h1, 0);
	MF4 ni1 = max(-i1, 0);

	a1 = max(a1, 0);
	b1 = max(b1, 0);
	c1 = max(c1, 0);
	d1 = max(d1, 0);
	e1 = max(e1, 0);
	f1 = max(f1, 0);
	g1 = max(g1, 0);
	h1 = max(h1, 0);
	i1 = max(i1, 0);

	MF4 a2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	MF4 b2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	MF4 c2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	MF4 d2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	MF4 e2 = conv2d_1_tf1.SampleLevel(sam, pos, 0);
	MF4 f2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	MF4 g2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	MF4 h2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	MF4 i2 = conv2d_1_tf1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	MF4 na2 = max(-a2, 0);
	MF4 nb2 = max(-b2, 0);
	MF4 nc2 = max(-c2, 0);
	MF4 nd2 = max(-d2, 0);
	MF4 ne2 = max(-e2, 0);
	MF4 nf2 = max(-f2, 0);
	MF4 ng2 = max(-g2, 0);
	MF4 nh2 = max(-h2, 0);
	MF4 ni2 = max(-i2, 0);

	a2 = max(a2, 0);
	b2 = max(b2, 0);
	c2 = max(c2, 0);
	d2 = max(d2, 0);
	e2 = max(e2, 0);
	f2 = max(f2, 0);
	g2 = max(g2, 0);
	h2 = max(h2, 0);
	i2 = max(i2, 0);

	MF4 target1 = { -0.079253934, 0.001511763, 0.100159355, 0.01585197 };
	target1 = MulAdd(a1, MF4x4(0.14315613, -0.031299837, -0.011195234, 0.0073360316, 0.07264984, -0.110979274, 0.06560588, -0.040463638, 0.28964168, -0.05644335, -0.060729366, -0.15811591, 0.028339373, 0.027486937, 0.0360574, 0.05856459), target1);
	target1 = MulAdd(b1, MF4x4(0.16211128, 0.20672597, -0.30374205, -0.056202736, -0.10893948, 0.053066984, -0.18297112, 0.028844962, 0.22754766, -0.07141921, 0.07142953, -0.1357581, 0.008053467, 0.04668908, 0.17258649, 0.22506891), target1);
	target1 = MulAdd(c1, MF4x4(0.07014762, 0.032112304, 0.028849715, 0.09427007, 0.008323501, -0.085777245, 0.083501115, -0.16150802, 0.24127382, -0.1305689, -0.027557204, -0.15057805, 0.09748757, 0.08182083, -0.107643455, 0.020552907), target1);
	target1 = MulAdd(d1, MF4x4(-0.04630706, -0.056070503, 0.058440026, -0.005662525, 0.08736355, 0.08821088, -0.049539115, 0.08171937, 0.28466523, -0.025859421, -0.0026971614, -0.15181617, -0.022231927, 0.3566104, -0.024887348, 0.12051598), target1);
	target1 = MulAdd(e1, MF4x4(-0.20976813, -0.23778942, 0.28854275, -0.27583683, -0.27604774, -0.15861328, 0.09581984, 0.06572128, 0.092306405, -0.06962751, -0.042226445, 0.035234913, 0.084891975, -0.03846841, -0.1473667, 0.2810354), target1);
	target1 = MulAdd(f1, MF4x4(0.028011162, 0.08945262, 0.15859836, 0.18426442, 0.10649845, -0.0918649, -0.12257575, -0.00914911, 0.23701023, -0.030067213, -0.01938559, -0.11026175, -0.5953985, 0.28875506, -0.035278864, -0.05043055), target1);
	target1 = MulAdd(g1, MF4x4(-0.14445779, -0.06907616, 0.13078876, -0.0089114, -0.110637166, -0.123719245, -0.094949, 0.046267383, 0.4727523, 0.0073195575, -0.014788787, -0.14922102, -0.021974785, -0.10706751, 0.00049029186, 0.09215345), target1);
	target1 = MulAdd(h1, MF4x4(-0.20936993, -0.22377276, -0.07697398, 0.039161056, 0.044213686, 0.037542075, -0.06600642, 0.017124292, 0.3406197, 0.011907687, 0.019732054, -0.22745137, -0.22178015, 0.49051985, -0.03707166, 0.14849792), target1);
	target1 = MulAdd(i1, MF4x4(0.07833466, 0.10888627, 0.16015877, -0.049263358, 0.29002127, -0.010949114, 0.013081097, -0.071674205, 0.3532135, 0.013165473, -0.05282189, -0.16688257, 0.009552089, -0.2740816, 0.04927233, -0.37047002), target1);
	target1 = MulAdd(a2, MF4x4(0.23682123, -0.027914839, 0.02372468, -0.07127212, 0.053436097, 0.057737537, -0.008556659, -0.025973454, 0.06468388, 0.18805866, -0.08180048, 0.058999106, -0.3058321, -0.06642967, -0.092997625, 0.10527466), target1);
	target1 = MulAdd(b2, MF4x4(-0.1353085, -0.016593851, 0.21518163, -0.10272456, 0.14382689, 0.05056661, -0.27799338, 0.11351653, 0.05838342, 0.28104934, -0.03777824, 0.003435516, 0.057915565, -0.17574134, -0.24437475, 0.13420977), target1);
	target1 = MulAdd(c2, MF4x4(0.13400255, -0.056437124, 0.11310834, 0.040429913, 0.098928474, -0.020769242, -0.079605736, 0.0494632, 0.0660877, 0.098982334, -0.055884495, -0.046533633, 0.17815505, 0.027310565, -0.24176653, -0.025550256), target1);
	target1 = MulAdd(d2, MF4x4(0.03637618, -0.012618673, 0.11865397, 0.19804053, -0.03522831, 0.24310908, -0.056454524, -0.44885796, 0.02212509, -0.20253624, 0.038810212, -0.17396528, 0.08970355, 0.005103078, 0.061075203, 0.44292897), target1);
	target1 = MulAdd(e2, MF4x4(-0.25074747, -0.0015575301, -0.685015, 0.07345307, -0.08419402, 0.06640714, 0.43799296, -0.17571151, 0.0049855476, 0.09024738, 0.055744022, 0.018739637, 0.34734032, 0.114896655, 0.0404696, -0.11327049), target1);
	target1 = MulAdd(f2, MF4x4(-0.12284062, -0.31131, -0.14712588, -0.18645866, 0.17581487, 0.1357234, 0.09913364, 0.005298711, -0.056155153, 0.042429443, 0.039454732, -0.04111384, 0.2623163, 0.09701166, 0.022825675, 0.050480727), target1);
	target1 = MulAdd(g2, MF4x4(0.058734808, 0.038528245, -0.042670116, -0.15190329, -0.028179986, -0.05362995, 0.017090468, -0.24449602, -0.08240927, -0.033122182, 0.009938243, -0.0052937623, 0.2171439, 0.06879817, -0.10361997, 0.018995138), target1);
	target1 = MulAdd(h2, MF4x4(0.027555468, 0.016337285, 0.19074728, 0.26690376, -0.088713005, -0.0021182299, -0.23062791, -0.32101163, -0.0040022335, 0.16835448, 0.05424022, -0.02156396, 0.24163729, 0.10243619, -0.04331782, -0.014350939), target1);
	target1 = MulAdd(i2, MF4x4(-0.13836963, 0.053369813, 0.036432605, 0.062288612, -0.06264361, -0.049093347, -0.0315955, -0.11237456, -0.064744405, -0.0151798045, 0.044210885, 0.010166375, -0.038355727, -0.05203739, -0.075036794, 0.1664177), target1);
	target1 = MulAdd(na1, MF4x4(-0.08583114, 0.08268218, -0.05771351, 0.10195048, -0.10128163, 0.10874855, -0.02580701, 0.028834302, 0.1950179, -0.0130183315, 0.0092119705, -0.060479227, 0.117747106, 0.061403573, -0.0028475628, -0.032362986), target1);
	target1 = MulAdd(nb1, MF4x4(-0.05310153, -0.061091065, 0.19438389, -0.10475873, 0.00045120303, -0.24876194, 0.017168125, -0.050173752, 0.012073283, 0.035660096, -0.017562328, -0.110271364, -0.015546384, 0.17965329, 0.10068208, -0.014481325), target1);
	target1 = MulAdd(nc1, MF4x4(0.085558474, -0.0007109211, 0.20868625, 0.150163, -0.19283043, 0.025976779, 0.08384698, 0.031011146, 0.17268184, 0.008871077, -0.04097794, -0.12868725, 0.01336166, -0.038823843, 0.1703644, -0.067780636), target1);
	target1 = MulAdd(nd1, MF4x4(0.06480841, -0.44256654, -0.19949587, -0.030677497, -0.27930573, -0.041867044, -0.15648738, 0.11573067, 0.28664824, 0.009770385, -0.058617204, -0.06607673, -0.038160402, 0.009497089, 0.03303058, -0.079379834), target1);
	target1 = MulAdd(ne1, MF4x4(0.17752203, 0.10979527, -0.058749028, -0.30194217, 0.30484176, -0.20980492, -0.05234784, -0.2590473, 0.23003183, 0.21903595, -0.024891363, -0.14337292, -0.02971356, -0.29613075, -0.045642294, 0.23826689), target1);
	target1 = MulAdd(nf1, MF4x4(0.018211683, -0.005840598, -0.19021381, -0.096696235, 0.39998052, -0.34746838, -0.039627917, 0.087701194, 0.15526368, -0.008095372, -0.044220537, -0.08634815, -0.121496454, -0.06792033, -0.14959472, 0.078917444), target1);
	target1 = MulAdd(ng1, MF4x4(0.33109078, 0.012287281, -0.034155898, -0.04840956, 0.068748444, 0.006142039, 0.06598935, 0.024775596, 0.22379673, 0.056089353, -0.006119644, -0.018509025, 0.10084137, 0.15556572, -0.041211523, -0.21550669), target1);
	target1 = MulAdd(nh1, MF4x4(-0.058160853, 0.08899222, -0.17401625, -0.1449813, -0.015872562, -0.03780256, 0.15702572, 0.34013954, 0.1580772, 0.074823864, 0.035488904, -0.01627819, -0.15551315, -0.3638866, -0.09833458, 0.15037175), target1);
	target1 = MulAdd(ni1, MF4x4(-0.12707977, -0.19947061, -0.11524648, 0.09216174, -0.07161296, 0.05675567, 0.06843247, 0.2803306, 0.25222927, -0.044076066, 0.053775772, -0.09939824, 0.16903089, 0.11475717, -0.07015584, -0.036021322), target1);
	target1 = MulAdd(na2, MF4x4(-0.12290332, -0.05469477, 0.02696626, 0.051133692, -0.05541504, -0.2811521, -0.13008943, 0.031793896, -0.32529324, -0.01663752, -0.0658181, 0.17300756, 0.22281154, -0.11001508, 0.09578194, -0.055437982), target1);
	target1 = MulAdd(nb2, MF4x4(0.083753526, -0.048933715, -0.13912897, 0.10929772, -0.1789828, -0.1586524, -0.10964165, -0.08210391, -0.11568187, -0.04813496, -0.2417861, 0.24446528, 0.13570863, -0.26869404, 0.3013413, 0.11678686), target1);
	target1 = MulAdd(nc2, MF4x4(0.21105368, 0.15749952, -0.18983693, -0.023642758, -0.1633653, 0.10107988, 0.052329395, -0.080253236, 0.15375629, -0.045091413, 0.05070866, 0.12416106, 0.16600485, -0.10412354, 0.061849747, -0.084013924), target1);
	target1 = MulAdd(nd2, MF4x4(0.03863923, 0.03690167, -0.053106382, -0.07523278, -0.04214836, 0.53898096, 0.15308584, 0.22835171, -0.24771535, 0.1402687, 0.1000896, -0.08719167, 0.0886567, 0.15255097, 0.14695966, -0.06659865), target1);
	target1 = MulAdd(ne2, MF4x4(0.110334344, -0.12696493, 0.24256139, 0.02536166, 0.08322421, 0.022147777, -0.35030407, 0.13734557, 0.053133942, 0.43650532, -0.30170345, 0.08751837, 0.012917502, 0.27496436, 0.11422729, 0.15508565), target1);
	target1 = MulAdd(nf2, MF4x4(0.16684863, 0.26743406, 0.15951683, 0.033597723, -0.044719726, 0.1127182, 0.007923161, 0.06415458, -0.07269362, -0.07828715, 0.09216738, 0.11528897, -0.13371283, -0.124177165, 0.14804523, 0.14156726), target1);
	target1 = MulAdd(ng2, MF4x4(-0.041141883, 0.023617791, 0.11484465, 0.13131519, -0.14753738, 0.17067687, -0.017538434, 0.24042644, -0.058103643, 0.3143255, 0.02476919, -0.0024666793, -0.26759955, -0.06099211, 0.006415725, 0.10394301), target1);
	target1 = MulAdd(nh2, MF4x4(-0.04198037, 0.03277123, -0.25069895, -0.21043587, -0.27417016, 0.08047665, 0.29731026, 0.07629813, -0.15695353, -0.14299184, 0.026618432, 0.13265325, 0.07727133, 0.12872085, 0.13887435, 0.1347057), target1);
	target1 = MulAdd(ni2, MF4x4(0.039232086, 0.117847264, -0.071643315, -0.040677182, -0.029160816, -0.06968689, 0.12880929, 0.037579957, -0.036671028, -0.022678757, -0.069731854, 0.10590314, 0.028034678, -0.015759282, 0.047180142, -0.16366881), target1);
	
	MF4 target2 = { 0.019374544, -0.050425697, -0.005817216, -0.0059976326 };
	target2 = MulAdd(a1, MF4x4(0.024126908, 0.01737047, 0.04563732, 0.08303721, -0.21339902, 0.00025652428, -0.09666459, -0.07654246, -0.01201168, 0.14373912, 0.22268519, 0.049181588, -0.0751725, 0.006847365, -0.025867194, 0.19233267), target2);
	target2 = MulAdd(b1, MF4x4(-0.25251204, -0.34213448, -0.0022676045, 0.29270738, 0.08876456, 0.067294724, 0.2865476, -0.009144941, -0.074606106, 0.14566834, 0.14162645, 0.10980335, -0.7958991, -0.15410729, 0.038512416, -0.17033637), target2);
	target2 = MulAdd(c1, MF4x4(-0.115404196, -0.11004134, 0.13174473, -0.0006875606, 0.0051814034, 0.058522645, -0.0795437, 0.0011465811, -0.019500278, 0.12752724, 0.16985136, -0.054932587, 0.16734739, -0.04686017, -0.072241016, 0.054562975), target2);
	target2 = MulAdd(d1, MF4x4(-0.07528159, -0.113516726, 0.2081102, 0.009942251, 0.08256535, 0.050133914, 0.012745932, 0.13902397, 0.009369715, 0.083261885, 0.17366019, 0.069754004, 0.030654406, -0.045856245, -0.055254143, 0.16265897), target2);
	target2 = MulAdd(e1, MF4x4(-0.14366727, 0.24948351, 0.12160293, 0.10929859, -0.116071545, -0.11725494, -0.13926856, -0.026759636, 0.12723772, 0.1938045, -0.02745115, -0.0644584, -0.23854719, 0.059308372, -0.446269, -0.06978486), target2);
	target2 = MulAdd(f1, MF4x4(0.21108554, -0.1717225, 0.066633105, 0.15418948, -0.08902029, 0.047925282, 0.15817304, -0.080941506, 0.007364865, 0.10506626, 0.20205018, -0.078695655, 0.14004812, -0.3195092, 0.19157887, -0.12697977), target2);
	target2 = MulAdd(g1, MF4x4(-0.08145032, -0.14292753, 0.066565305, -0.061348185, -0.08738346, 0.011608093, -0.0024047727, -0.024741996, -0.11547277, 0.10013328, 0.21730538, 0.05598899, -0.17741105, 0.075944185, 0.027434295, -0.2550598), target2);
	target2 = MulAdd(h1, MF4x4(-0.026223006, 0.11214396, -0.133987, 0.1303522, 9.32011e-05, -0.14755996, -0.14002979, -0.039624512, 0.045111652, 0.17618611, 0.17764348, 0.104528464, 0.20592515, 0.07240335, -0.27604735, 0.038880046), target2);
	target2 = MulAdd(i1, MF4x4(0.17734227, -0.002935363, 0.07505682, -0.029969893, -0.024536638, 0.11236127, 0.119374484, 0.08002781, -0.003541722, 0.1428466, 0.1729824, 0.055412393, -0.04790376, 0.18020035, 0.05376964, -0.1520942), target2);
	target2 = MulAdd(a2, MF4x4(-0.11352182, -0.019249126, 0.10782615, 0.03079928, 0.020381734, -0.08998433, -0.09211494, -0.054406203, 0.1828849, -0.07692097, 0.004733955, -0.026685018, -0.08044814, -0.071961075, 0.029184176, -0.22562811), target2);
	target2 = MulAdd(b2, MF4x4(-0.34489468, -0.07447471, 0.026422959, 0.33550653, 0.22130035, 0.059709545, -0.07646962, -0.18386386, 0.33911958, -0.07534871, 0.040870134, 0.051136248, 0.32681262, 0.20612194, -0.1609581, -0.70460784), target2);
	target2 = MulAdd(c2, MF4x4(0.27617922, 0.09758603, 0.05103887, -0.09281693, -0.007143339, 0.006635712, -0.055270564, -0.022629099, -0.13023081, -0.013819027, -0.038695697, 0.047280338, -0.13964762, 0.09852924, -0.10056262, -0.084967695), target2);
	target2 = MulAdd(d2, MF4x4(0.1370323, 0.030904075, -0.033860117, 0.08926374, -0.14616281, -0.29926816, -0.23738252, -0.21374625, -0.14039646, 0.11503669, 0.082101606, -0.061717354, 0.021357644, -0.10676707, 0.03214661, 0.029967157), target2);
	target2 = MulAdd(e2, MF4x4(-0.29881296, -0.22195289, -0.3512607, -0.2277441, 0.033705913, -0.23267402, -0.119738854, -0.18925253, 0.068823405, -0.15160555, 0.2585695, 0.10484223, -0.012574211, 0.38808516, 0.2599094, -0.4991424), target2);
	target2 = MulAdd(f2, MF4x4(-0.07474731, 0.22742131, 0.014462262, 0.08409484, 0.09579643, -0.0519534, 0.0007793075, -0.044820115, -0.010144471, -0.040506937, 0.0056340825, 0.057767954, -0.14988829, -0.05099549, 0.007204364, -0.07094934), target2);
	target2 = MulAdd(g2, MF4x4(-0.05736621, 0.12072876, -0.02037183, 0.05012334, -0.1173538, -0.10062993, -0.0033958228, 0.0142556345, -0.011005385, -0.0066177617, -0.058390465, 0.048240293, 0.09835053, 0.17917523, -0.06466951, 0.017518612), target2);
	target2 = MulAdd(h2, MF4x4(0.1413101, -0.30268928, -0.17851736, -0.10797371, -0.01964573, 0.14356858, -0.06759965, 0.17416531, 0.13905385, -0.017476829, 0.06541924, -0.044690568, -0.080723755, -0.08610206, 0.095347285, -0.09233214), target2);
	target2 = MulAdd(i2, MF4x4(-0.07254187, -0.091158785, 0.018472971, 0.03514051, 0.018888336, 0.107934274, -0.018830854, 0.10007211, -0.053966418, -0.035646267, -0.031214178, -0.05980228, -0.13045661, -0.011743741, -0.03325275, 0.071065165), target2);
	target2 = MulAdd(na1, MF4x4(-0.037697386, 0.054388218, -0.010934479, 0.2266702, 0.049999133, 0.017648092, -0.044225454, 0.21611899, -0.03805845, 0.054236397, -0.018563407, -0.060588073, -0.031215845, 0.075081706, 0.07333242, -0.09651128), target2);
	target2 = MulAdd(nb1, MF4x4(-0.32236508, -0.0026381002, -0.30787975, 0.2963127, -0.13276175, 0.1058753, -0.12744896, 0.09749292, -0.02683677, -0.0041124597, 0.006103888, -0.09997201, 0.092101686, -0.08375288, 0.09641652, 0.053333007), target2);
	target2 = MulAdd(nc1, MF4x4(0.027999232, -0.060004722, -0.009207874, -0.0952888, -0.038418446, -0.13316345, 0.099323496, 0.048450433, 0.0443969, 0.056023613, 0.1156147, 0.018980766, 0.040020484, 0.07555044, 0.0039174426, -0.044098593), target2);
	target2 = MulAdd(nd1, MF4x4(-0.101029314, 0.33333415, -0.22052327, -0.035329416, 0.17229559, 0.12564908, -0.07879576, -0.09248896, -0.03239869, 0.022611454, 0.05610472, -0.02181683, -0.06347532, -0.077292696, 0.02005389, -0.078899406), target2);
	target2 = MulAdd(ne1, MF4x4(-0.028139396, -0.04349171, -0.019393284, 0.42110333, 0.37065667, 0.5282552, 0.43816927, 0.19155908, 0.051832534, 0.02050813, 0.030795977, 0.023960136, -0.27617985, 0.19165507, -0.005492024, -0.13349663), target2);
	target2 = MulAdd(nf1, MF4x4(5.0700226e-05, 0.21293098, -0.39902148, -0.058406413, -0.06766975, 0.1129277, -0.012398328, 0.025031524, 0.03519656, 0.06486415, 0.15710293, 0.014098051, 0.057754945, 0.116186336, -0.14429826, 0.051864166), target2);
	target2 = MulAdd(ng1, MF4x4(-0.012280755, 0.043744788, -0.06420968, 0.012739398, 0.043073926, 0.031230433, 0.00036492705, -0.039208546, -0.09329152, 0.06928111, 0.11622664, -0.009106846, 0.111528054, -0.020315262, 0.036427997, 0.15881014), target2);
	target2 = MulAdd(nh1, MF4x4(-0.066635534, 0.13901882, 0.0885122, 0.1030835, 0.08539728, -0.015466482, 0.0706688, -0.1611047, 0.02179479, -0.00048529037, 0.08708685, -0.00894464, -0.13046473, -0.21456988, -0.20666413, 0.049039323), target2);
	target2 = MulAdd(ni1, MF4x4(-0.100800075, -0.03772198, -0.095183305, -0.15150243, -0.08743059, -0.24299338, -0.019315414, -0.1574107, -0.013610722, 0.064871654, 0.058439128, 0.008972897, 0.10339555, -0.027356634, 0.07666196, 0.048524544), target2);
	target2 = MulAdd(na2, MF4x4(0.046309173, -0.03858991, -0.13260359, 0.0017626585, 0.1453724, 0.1402359, -0.079240486, 0.13017912, 0.0629575, -0.15448172, -0.1856442, -0.044694453, -0.17226808, -0.08065212, -0.008038736, -0.15994963), target2);
	target2 = MulAdd(nb2, MF4x4(0.18369722, 0.03849556, -0.035185467, -0.20205377, 0.03879293, 0.02712859, -0.051278092, 0.14862835, 0.10261192, 0.18085574, -0.025982017, -0.029160796, 0.5301373, 0.09614058, 0.35518438, -0.014906588), target2);
	target2 = MulAdd(nc2, MF4x4(-0.31154996, -0.06868871, -0.012681131, 0.028093819, -0.37321633, -0.14738804, 0.06060776, 0.050054748, 0.013779029, -0.020390315, -0.12487434, -0.0029474346, -0.274524, -0.09142805, 0.0132142445, 0.1577639), target2);
	target2 = MulAdd(nd2, MF4x4(-0.02177336, -0.020817943, -0.0111796055, -0.0046033757, 0.45033064, 0.3573757, 0.55279994, 0.602122, -0.05536106, -0.33642644, -0.1851379, -0.052192084, 0.03683446, 0.13613251, 0.20098919, -0.090587094), target2);
	target2 = MulAdd(ne2, MF4x4(0.1520822, 0.37173554, -0.061298244, 0.0019386727, 0.44656134, 0.13406622, 0.39018136, 0.5722051, -0.13074401, 0.012778576, -0.2837446, 0.16098566, 0.100189455, -0.40386122, 0.17464107, -0.17862785), target2);
	target2 = MulAdd(nf2, MF4x4(-0.01217905, -0.24295084, 0.08192982, -0.14160301, -0.05936872, -0.003312342, -0.07542139, 0.13488367, -0.21560493, -0.14342502, -0.19195864, -0.09448305, -0.1038431, -0.075766176, 0.03226791, 0.06455397), target2);
	target2 = MulAdd(ng2, MF4x4(-0.076916575, -0.10891301, 0.032635316, 0.03848802, 0.15750243, 0.48169684, 0.5410635, 0.017279895, 0.012730932, -0.0059071835, 0.030766146, -0.0225503, -0.030178519, -0.05866621, 0.033593398, -0.00033098995), target2);
	target2 = MulAdd(nh2, MF4x4(-0.10757409, 0.2644168, -0.025696747, -0.0077012815, 0.31728277, 0.29771668, 0.2443613, -0.047722775, -0.083712585, -0.12742844, -0.3138776, -0.059888497, 0.12291351, -0.14435866, 0.051414594, -0.11889901), target2);
	target2 = MulAdd(ni2, MF4x4(-0.063888945, 0.002844068, -0.06129518, 0.03381495, 0.10176077, -0.11625004, -0.10745763, -0.20636752, -0.03820934, 0.01926402, -0.20310643, 0.09767577, -0.00776684, 0.13453315, -0.036967937, 0.09780335), target2);

	conv2d_2_tf[gxy] = target1;
	conv2d_2_tf1[gxy] = target2;
}

//!PASS 4
//!DESC Conv-4x3x3x16
//!IN conv2d_2_tf, conv2d_2_tf1
//!OUT conv2d_3_tf, conv2d_3_tf1
//!BLOCK_SIZE 8
//!NUM_THREADS 64

void Pass4(uint2 blockStart, uint3 threadId) {
	uint2 gxy = Rmp8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// [ a, d, g ]
	// [ b, e, h ]
	// [ c, f, i ]
	MF4 a1 = conv2d_2_tf.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	MF4 b1 = conv2d_2_tf.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	MF4 c1 = conv2d_2_tf.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	MF4 d1 = conv2d_2_tf.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	MF4 e1 = conv2d_2_tf.SampleLevel(sam, pos, 0);
	MF4 f1 = conv2d_2_tf.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	MF4 g1 = conv2d_2_tf.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	MF4 h1 = conv2d_2_tf.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	MF4 i1 = conv2d_2_tf.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	MF4 na1 = max(-a1, 0);
	MF4 nb1 = max(-b1, 0);
	MF4 nc1 = max(-c1, 0);
	MF4 nd1 = max(-d1, 0);
	MF4 ne1 = max(-e1, 0);
	MF4 nf1 = max(-f1, 0);
	MF4 ng1 = max(-g1, 0);
	MF4 nh1 = max(-h1, 0);
	MF4 ni1 = max(-i1, 0);

	a1 = max(a1, 0);
	b1 = max(b1, 0);
	c1 = max(c1, 0);
	d1 = max(d1, 0);
	e1 = max(e1, 0);
	f1 = max(f1, 0);
	g1 = max(g1, 0);
	h1 = max(h1, 0);
	i1 = max(i1, 0);

	MF4 a2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	MF4 b2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	MF4 c2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	MF4 d2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	MF4 e2 = conv2d_2_tf1.SampleLevel(sam, pos, 0);
	MF4 f2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	MF4 g2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	MF4 h2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	MF4 i2 = conv2d_2_tf1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	MF4 na2 = max(-a2, 0);
	MF4 nb2 = max(-b2, 0);
	MF4 nc2 = max(-c2, 0);
	MF4 nd2 = max(-d2, 0);
	MF4 ne2 = max(-e2, 0);
	MF4 nf2 = max(-f2, 0);
	MF4 ng2 = max(-g2, 0);
	MF4 nh2 = max(-h2, 0);
	MF4 ni2 = max(-i2, 0);

	a2 = max(a2, 0);
	b2 = max(b2, 0);
	c2 = max(c2, 0);
	d2 = max(d2, 0);
	e2 = max(e2, 0);
	f2 = max(f2, 0);
	g2 = max(g2, 0);
	h2 = max(h2, 0);
	i2 = max(i2, 0);

	MF4 target1 = { 0.052450567, 0.10404023, -0.059578225, 0.009724121 };
	target1 = MulAdd(a1, MF4x4(-0.028328063, 0.038015317, 0.14893384, 0.10103896, 0.028176744, -0.02067147, -0.10970998, 0.015726602, -0.07402682, -0.075281784, -0.012586929, 0.053476278, 0.14823362, 0.20312984, 0.24213, 0.039118115), target1);
	target1 = MulAdd(b1, MF4x4(0.009731573, -0.019011121, 0.016360838, -0.0073153526, 0.14594506, -0.0427664, -0.094225354, -0.013891855, -0.037061375, 0.024959227, -0.12289382, -0.21792257, -0.33579424, 0.052678566, 0.04346115, 0.07943186), target1);
	target1 = MulAdd(c1, MF4x4(0.0022269129, 0.013298362, -0.045071404, -0.007918287, 0.010860651, -0.073057, -0.0042394064, 0.03340809, 0.04938919, -0.024218693, -0.008147567, 0.08848061, -0.06840333, 0.10077341, -0.272586, -0.06542769), target1);
	target1 = MulAdd(d1, MF4x4(0.15929016, -0.1415167, 0.057084452, 0.06830724, 0.0046992986, 0.068573505, 0.22142749, -0.18493174, -0.1006019, -0.11373546, 0.17520057, -0.12888812, 0.05176946, -0.14703397, -0.20610721, 0.16611591), target1);
	target1 = MulAdd(e1, MF4x4(-0.0069309813, 0.22358349, -0.18569049, 0.13456121, -0.21528137, 0.04618922, -0.47261322, -0.09682613, 0.5402922, 0.15818685, 0.05288464, -0.09949312, 0.21833964, 0.06652228, -0.2694682, 0.58216536), target1);
	target1 = MulAdd(f1, MF4x4(0.040808782, 0.023110595, 0.12678777, -0.09057271, 0.03159572, 0.044006016, -0.10090222, 0.09940838, -0.08454473, -0.118349984, -0.053009644, 0.24352531, -0.103818566, 0.12536442, -0.17832974, 0.25161982), target1);
	target1 = MulAdd(g1, MF4x4(-0.026323501, -0.14911288, -0.0073903934, 0.06902844, 0.07188603, -0.05006621, 0.06539817, -0.048083752, -0.08032579, -0.07449341, -0.015944218, 0.032426495, 0.069349505, -0.07441237, 0.055614363, 0.065174624), target1);
	target1 = MulAdd(h1, MF4x4(-0.046432327, -0.051616143, 0.017791865, -0.047294978, 0.025944458, -0.0020909954, 0.083794415, -0.055740435, -0.3720184, 0.06654654, 0.1944378, 0.07806658, 0.00870193, 0.005404396, -0.059417505, -0.06270168), target1);
	target1 = MulAdd(i1, MF4x4(-0.011699918, -0.03260685, 0.016413182, -0.02199741, -0.042898953, -0.018734168, -0.12387174, 0.06405199, -0.050764065, 0.07050078, 0.006969675, 0.05508108, -0.079687595, 0.12154926, 0.071177684, 0.046873443), target1);
	target1 = MulAdd(a2, MF4x4(-0.2158498, 0.03612371, -0.05268691, -0.065594874, 0.06997431, -0.07327132, -0.03323361, -0.23306306, -0.00011140713, -0.1891967, -0.017328357, 0.15796778, -0.061359044, 0.008202449, -0.031317197, -0.020873578), target1);
	target1 = MulAdd(b2, MF4x4(-0.022816254, -0.014594548, 0.0064240466, 0.07976367, -0.0070318123, -0.07651652, -0.111756384, -0.2788498, 0.16634429, -0.1583179, -0.10245271, 0.10480152, 0.345051, -0.07809675, 0.046080578, -0.32139245), target1);
	target1 = MulAdd(c2, MF4x4(0.020630263, 0.032152038, 0.0019161701, 0.05435833, 0.078139454, -0.10090956, 0.14244889, 0.017286595, 0.0039871824, -0.026395446, 0.14158171, 0.0010112645, 0.17055373, -0.08093189, -0.049234428, -0.33473247), target1);
	target1 = MulAdd(d2, MF4x4(-0.10982378, 0.029386539, -0.15483, -0.04345961, -0.16869037, -0.30862433, 0.060743757, -0.032285906, 0.017884266, -0.09846199, -0.090971105, -0.1693697, -0.063690096, -0.08489718, 0.18247683, -0.19921213), target1);
	target1 = MulAdd(e2, MF4x4(0.1898742, 0.22187345, -0.28495324, -0.42578775, 0.12833633, -0.2251503, -0.025917793, 0.6011678, -0.36586264, 0.23302059, -0.072634645, 0.0064221635, 0.56792957, -0.4684677, 0.05015159, 0.30121225), target1);
	target1 = MulAdd(f2, MF4x4(0.10837159, 0.14743729, -0.03639783, -0.34797576, -0.18306817, -0.07957882, -0.111433275, 0.30104128, -0.102763996, -0.01020151, 0.016333267, -0.012390819, 0.11835027, -0.12597388, -0.006298998, 0.0513052), target1);
	target1 = MulAdd(g2, MF4x4(-0.23662986, 0.23325302, -0.046104953, 0.36488137, 0.06990537, -0.06887873, -0.012611426, -0.02618366, -0.05296669, 0.195254, 0.016366778, 0.01693462, -0.08488424, -0.24656284, -0.035283253, -0.15318634), target1);
	target1 = MulAdd(h2, MF4x4(0.061704446, -0.26930714, -0.24598889, 0.27657726, 0.05046488, -0.341884, 0.10704377, -0.15971762, 0.072999336, -0.2005826, -0.05874223, -0.053938035, -0.08284583, -0.22792995, 0.1027033, -0.012932447), target1);
	target1 = MulAdd(i2, MF4x4(-0.029079054, 0.14774945, 0.026151389, 0.12380946, 0.08926635, -0.08387116, -0.17018612, -0.09304499, 0.086990625, -0.27579373, 0.003823722, -0.024723161, 0.08762848, -0.10080674, -0.012214886, -0.30239874), target1);
	target1 = MulAdd(na1, MF4x4(-0.25756493, 0.2537789, 0.21723714, 0.0017929028, -0.014724892, 0.086692676, 0.11934202, -0.025869334, 0.008618976, -0.0046638376, -0.06863114, -0.07598151, -0.17309345, 0.009138105, -0.099874064, 0.07377463), target1);
	target1 = MulAdd(nb1, MF4x4(-0.39971545, 0.16774859, 0.13102596, 0.30735064, -0.060374007, -0.036933452, 0.14408773, 0.06479284, 0.03806265, 0.045560133, 0.043136165, -0.019244662, 0.17573427, -0.11398941, -0.0751098, 0.041702736), target1);
	target1 = MulAdd(nc1, MF4x4(-0.074492976, 0.18349282, -0.009050458, 0.0869807, -0.23123743, -0.015426683, -0.14346392, 0.005445149, -0.05322614, 0.10287576, 0.16083732, -0.09557319, -0.13891783, -0.13752605, -0.023572346, 0.13608918), target1);
	target1 = MulAdd(nd1, MF4x4(-0.31140685, 0.40130782, 0.07704675, 0.27509958, 0.09711739, -0.18293281, -0.14500841, -0.15334702, 0.098314695, 0.22749798, 0.006017282, -0.013669673, 0.07147038, 0.022289474, -0.036797456, -0.0013958871), target1);
	target1 = MulAdd(ne1, MF4x4(0.0547557, -0.03036202, 0.65113044, 0.10668893, 0.304707, -0.1456157, 0.27668485, 0.2279428, -0.42439902, -0.0073047588, 0.045635667, 0.271637, -0.19595222, -0.28107607, 0.3905438, -0.29898256), target1);
	target1 = MulAdd(nf1, MF4x4(0.076843366, 0.037181348, 0.08652873, 0.1756985, 0.03728033, -0.22783624, 0.16810594, -0.022009399, 0.16058537, 0.24559903, 0.05266939, -0.13929726, 0.15964857, 0.0013167082, 0.015017631, 0.101646364), target1);
	target1 = MulAdd(ng1, MF4x4(-0.3022452, 0.20052882, 0.13433233, 0.04250016, -0.15248592, 0.014216527, -0.23489903, 0.13919333, 0.22891816, -0.0053335144, -0.05567782, -0.12769286, -0.05337762, -0.11429989, -0.00882089, -0.030790573), target1);
	target1 = MulAdd(nh1, MF4x4(-0.11763547, 0.1073185, 0.15810886, 0.013149736, -0.028268294, -0.24712053, 0.08592036, 0.075742744, 0.19626461, -0.10880887, -0.22599675, -0.37207767, -0.032548983, -0.011045266, -0.035218395, 0.099996395), target1);
	target1 = MulAdd(ni1, MF4x4(0.05631665, 0.029538663, 0.043909863, 0.13720988, 0.10980592, -0.047748722, 0.080308706, -0.06828442, 0.1144396, -0.12510885, -0.067976676, 0.030742755, 0.07134681, -0.06652312, -0.0642328, -0.034490924), target1);
	target1 = MulAdd(na2, MF4x4(0.019588284, -0.15197967, -0.16797094, -0.026324488, 0.014429439, -0.028491383, 0.059453625, 0.23443304, 0.02504347, 0.08872467, 0.032782357, -0.085310735, 0.013040259, -0.09837991, 0.073533125, -0.03544458), target1);
	target1 = MulAdd(nb2, MF4x4(0.02198588, -0.09614766, 0.024655875, 0.025384603, 0.012162857, 0.065071434, 0.018112874, 0.19828922, -0.33289856, 0.011323505, 0.13696423, 0.31772846, -0.06587399, -0.05569957, -0.16469179, -0.22545892), target1);
	target1 = MulAdd(nc2, MF4x4(-0.009093827, 0.086783886, 0.060070645, 0.049957857, 0.041628215, 0.082412794, 0.117729135, -0.178277, 0.08326062, -0.07120824, 0.1788718, 0.050748438, -0.08952197, -0.14609487, 0.05515471, 0.14784457), target1);
	target1 = MulAdd(nd2, MF4x4(-0.10823147, -0.05108019, 0.092807196, -0.13899301, 0.19123949, -0.044189975, 0.0030145745, 0.08935499, -0.10338727, 0.01996205, 0.15671325, -0.08229972, 0.05603653, 0.043324884, 0.13562247, -0.11487494), target1);
	target1 = MulAdd(ne2, MF4x4(-0.18872134, -0.07302765, 0.030137405, 0.30928415, -0.07689583, 0.045998566, 0.45554903, -0.1653404, 0.14705873, -0.10649596, 0.46833125, 0.17608039, -0.43967086, 0.056812476, -0.17908083, -0.40455228), target1);
	target1 = MulAdd(nf2, MF4x4(-0.08093384, 0.032636635, 0.124594346, 0.13655491, 0.16780408, -1.4671803e-05, 0.13044862, -0.397665, -0.013273644, 0.08253894, 0.16302188, -0.052874118, 0.04073075, -0.18063635, -0.00838661, -0.31084144), target1);
	target1 = MulAdd(ng2, MF4x4(0.06804371, -0.14755388, -0.12055216, -0.00437858, -0.044694718, 0.22744909, 0.012434794, 0.06245207, 0.00560859, -0.15815294, -0.19711316, 0.07711764, 0.03078979, -0.09560189, 0.10509056, 0.010651465), target1);
	target1 = MulAdd(nh2, MF4x4(-0.026342146, 0.13919179, -0.0030414977, 0.06607403, 0.071292974, 0.065464914, -0.027091878, 0.10620255, -0.052090824, 0.06840278, -0.08457357, 0.08867469, 0.2976581, -0.6702739, -0.15472057, -0.3066263), target1);
	target1 = MulAdd(ni2, MF4x4(-0.00072869845, 0.046573937, -0.08363707, 0.07867379, 0.038065, 0.01228845, 0.031746328, -0.024448024, -0.065555945, 0.1220454, 0.032151606, -0.022336006, -0.0010816467, -0.026455112, 0.112422734, -0.10285581), target1);
	
	MF4 target2 = { 0.022815697, 0.012251767, 0.045309987, -0.0879881 };
	target2 = MulAdd(a1, MF4x4(-0.037506457, -0.06573841, -0.087879084, -0.06359248, -0.0017873603, -0.009097742, 0.010108622, 0.026364084, 0.012306545, 0.12607974, -0.088268295, 0.14034338, 0.24951904, 0.0983314, 0.03635719, -0.047059253), target2);
	target2 = MulAdd(b1, MF4x4(-0.05570699, 0.11044774, 0.04827364, -0.03185735, -0.032498132, -0.062959515, 0.2933071, 0.22244357, 0.061075654, 0.0064111133, 0.011452209, 0.11576761, 0.13969804, 0.20502032, 0.1114938, 0.022496287), target2);
	target2 = MulAdd(c1, MF4x4(-0.054194342, 0.000389916, -0.039589155, -0.018707246, -0.036095835, -0.06873059, -0.077109694, 0.028726012, -0.08820959, -0.109247595, -0.05745309, 0.043230128, 0.033671502, 0.16398554, 0.030398889, -0.17000203), target2);
	target2 = MulAdd(d1, MF4x4(-0.09218165, -0.12813722, -0.040984686, -0.016605416, 0.054269493, 0.12971285, -0.013961638, -0.17803082, -0.014683587, 0.2502267, -0.14249405, -0.025687713, -0.097426265, -0.30111355, -0.21776466, 0.008809217), target2);
	target2 = MulAdd(e1, MF4x4(0.21033873, 0.15221386, 0.18138756, -0.08248389, -0.10091519, -0.06940753, -0.014009188, -0.3009861, -0.02452202, -0.08800422, -0.36376888, 0.18485394, 0.35076657, -0.13293292, 0.24624826, 0.39373755), target2);
	target2 = MulAdd(f1, MF4x4(0.014170062, -0.029623963, 0.057001226, 0.09269898, -0.14630881, -0.16557585, 0.06735037, -0.015008042, -0.27238864, 0.081130914, -0.07869508, 0.098087415, 0.11217335, 0.48223323, 0.18613088, -0.035602476), target2);
	target2 = MulAdd(g1, MF4x4(-0.21623239, -0.1125095, -0.09964635, 0.101452544, 0.11877652, 0.13471957, -0.10402355, 0.0077938605, 0.030518647, 0.22309083, -0.2115206, 0.017967062, -0.042780407, 0.099759325, -0.10465051, -0.033807248), target2);
	target2 = MulAdd(h1, MF4x4(-0.059608232, 0.06684556, 0.00039066386, 0.08542961, 0.097183906, -0.1868667, 0.07778909, -0.06172202, 0.0021662437, -0.05387577, -0.4077133, -0.028940776, 0.110816136, -0.04154161, 0.030078325, 0.072834246), target2);
	target2 = MulAdd(i1, MF4x4(-0.01881586, -0.06384429, -0.054874837, -0.016731417, -0.06570834, -0.13579571, 0.0033891131, -0.059161015, -0.11559389, 0.02149361, -0.08791608, -0.008113861, 0.08313892, -0.07327947, -0.013473171, 0.13254371), target2);
	target2 = MulAdd(a2, MF4x4(-0.11458958, -0.08827364, -0.025030116, 0.12626298, 0.0070429775, 0.0337767, 0.051719055, -0.09654129, -0.04867615, -0.03609001, -0.06522421, -0.044131942, -0.048825134, 0.10652733, -0.015310965, -0.07341175), target2);
	target2 = MulAdd(b2, MF4x4(0.05782829, 0.014247012, 0.12126171, 0.100055166, 0.24079333, -0.20155986, 0.1640186, -0.12158374, -0.153708, -0.24445893, -0.10536192, 0.12758626, -0.19430119, -0.019024884, -0.080120996, -0.29866305), target2);
	target2 = MulAdd(c2, MF4x4(-0.017357074, 0.04390695, 0.12889594, 0.11451521, 0.03333342, -0.16417275, 0.10196121, 0.13059081, 0.09948873, 0.15007107, 0.22664218, 0.35449567, -0.089776486, 0.025239054, 0.12463201, -0.13109131), target2);
	target2 = MulAdd(d2, MF4x4(0.064875744, 0.40551752, 0.11903257, 0.14822967, 0.14993542, -0.12758526, 0.23159283, -0.06080246, -0.084577255, 0.14307548, -0.02186462, 0.05793564, -0.050965074, 0.23895216, -0.07796932, -0.1624384), target2);
	target2 = MulAdd(e2, MF4x4(-0.15942748, 0.07191155, 0.42204422, 0.35219797, 0.23286703, -0.283381, -0.2749432, 0.25922084, 0.10494953, 0.14575887, -0.19649154, -0.14563714, -0.03709703, 0.023375817, -0.05610175, -0.32548484), target2);
	target2 = MulAdd(f2, MF4x4(-0.04872624, -0.3592348, -0.027413938, 0.0836858, 0.046842758, -0.35193914, 0.06154142, 0.05559191, -0.22538327, -0.097689696, -0.21317257, -0.033945527, -0.23628096, -0.016477302, 0.027297588, -0.04105733), target2);
	target2 = MulAdd(g2, MF4x4(0.11543502, -0.043297376, 0.118703, 0.15013209, 0.03191795, 0.014122794, 0.05156918, 0.023102578, 0.0808462, -0.06445798, 0.15860644, -0.062393136, -0.018691704, -0.00032888897, 0.01196705, -0.025045555), target2);
	target2 = MulAdd(h2, MF4x4(0.08301664, 0.12298539, 0.20151077, 0.2993159, 0.16968682, -0.18196446, -0.13322797, -0.13693243, -0.0048389523, -0.057406515, 0.21409932, -0.060822334, -0.08554752, -0.19363636, -0.35241908, -0.32256603), target2);
	target2 = MulAdd(i2, MF4x4(-0.0523748, 0.17082025, 0.08556144, 0.19181536, -0.2445756, -0.3616732, -0.01641404, -0.078599006, 0.23907976, 0.025989126, 0.07574993, -0.06859337, -0.06667767, -0.022847861, -0.037942342, -0.21112117), target2);
	target2 = MulAdd(na1, MF4x4(0.15098672, 0.024212115, -0.19068481, -0.22606348, -0.15221487, -0.032165635, -0.06244531, -0.043535717, -0.07398802, -0.06088507, -0.013834592, -0.10145823, 0.06901983, -0.0862135, -0.05545454, 0.15514566), target2);
	target2 = MulAdd(nb1, MF4x4(0.044767097, -0.07583697, -0.17739761, -0.25538698, 0.0966659, -0.0013492911, -0.23315248, -0.21652249, -0.14381947, 0.017784966, -0.15960035, -0.13297895, 0.009810349, -0.041348267, 0.05443229, 0.17781278), target2);
	target2 = MulAdd(nc1, MF4x4(-0.0052824756, 0.087268956, -0.022167318, -0.09450279, 0.1254372, 0.075806946, 0.028893303, -0.09019378, 0.03488572, 0.046265777, 0.026162563, 0.003914548, -0.0632334, -0.19494742, -0.03602023, 0.113897055), target2);
	target2 = MulAdd(nd1, MF4x4(-0.11311528, 0.2616239, 0.12303548, 0.13427438, -0.26537886, 0.015112677, -0.03641703, -0.014114427, -0.023280613, 0.03626403, 0.12833157, 0.19168468, 0.2119137, -0.02374797, 0.117919676, 0.07794395), target2);
	target2 = MulAdd(ne1, MF4x4(-0.13746078, 0.25739196, 0.008431936, -0.053867325, -0.13228695, -0.20661803, 0.026474724, 0.3205188, -0.41819036, 0.42812085, 0.17249924, -0.15810613, 0.39602605, -0.10873597, 0.1457145, -0.060503867), target2);
	target2 = MulAdd(nf1, MF4x4(0.03706167, -0.036211733, 0.06519942, -0.2123978, 0.019934088, 0.17494182, -0.017252771, -0.067341134, -0.15416612, -0.114118524, -0.00028491023, -0.08172238, -0.11722721, -0.2647645, 0.13316637, 0.13562322), target2);
	target2 = MulAdd(ng1, MF4x4(0.11832847, 0.22822993, 0.020318847, 0.0734738, -0.025950216, -0.072782144, 0.11133989, 0.18845533, -0.004584898, -0.10486471, 0.054522812, -0.14136603, 0.01940983, -0.039433163, 0.008390286, 0.013686628), target2);
	target2 = MulAdd(nh1, MF4x4(-0.042335663, 0.0035399816, -0.1813205, -0.25639042, 0.1042524, 0.07707001, -0.04922454, 0.18140413, -0.22322963, 0.030809738, -0.11041754, -0.040288754, 0.09431559, -0.08017892, -0.18317147, -0.019331435), target2);
	target2 = MulAdd(ni1, MF4x4(-0.061776266, 0.0069793356, 0.019964112, -0.14504445, -0.00070097746, -0.027107855, 0.030182542, -0.05625612, -0.04958449, 0.123165295, 0.0013953283, 0.017912487, 0.031161329, -0.31798717, 0.018331604, 0.030411277), target2);
	target2 = MulAdd(na2, MF4x4(-0.0530594, -0.07933117, 0.024755973, 0.004785411, 0.045512546, 0.12833083, 0.023195961, -0.018028054, 0.014223835, 0.102213494, 0.052169293, -0.020509718, 0.017905682, 0.021354724, -0.0410789, -0.066523656), target2);
	target2 = MulAdd(nb2, MF4x4(0.017061293, -0.08770806, -0.04889939, 0.01825556, -0.03228951, -0.06838898, -0.09249373, 0.18103507, 0.087000825, 0.04175679, -0.09305919, -0.2792485, 0.03405797, 0.062147446, -0.04757652, -0.021603985), target2);
	target2 = MulAdd(nc2, MF4x4(-0.04115162, 0.02547615, 0.07033616, 0.09814065, 0.2597489, -0.0335038, 0.14097647, 0.047022782, 0.1374654, -0.27390274, 0.02080897, -0.15251215, -0.025431091, 0.08871465, -0.22243279, -0.07792812), target2);
	target2 = MulAdd(nd2, MF4x4(-0.061674852, -0.051326606, -0.04885301, 0.08548189, -0.07100394, 0.044875987, -0.19810183, -0.09841128, -0.06628199, -0.041564234, 0.1111919, -0.044448826, 0.06980301, 0.00046094303, -0.045978926, -0.20736355), target2);
	target2 = MulAdd(ne2, MF4x4(-0.18405268, -0.28115878, -0.33536536, 0.0753763, 0.028309148, 0.0014874876, 0.28369543, -0.2133985, 0.16520546, 0.29562506, 0.109781906, 0.028433772, -0.02691105, -0.39038795, -0.12942268, -0.080103286), target2);
	target2 = MulAdd(nf2, MF4x4(-0.05387814, -0.04672615, 0.046064686, 0.2791977, 0.11359623, -0.204098, -0.018091407, 0.13550591, 0.04216003, -0.1631328, -0.043013666, -0.045698896, 0.032403514, 0.010206319, -0.25789943, -0.36328712), target2);
	target2 = MulAdd(ng2, MF4x4(0.11280466, 0.11671405, -0.02122692, 0.021664057, -0.07836575, 0.014747725, 0.030007286, -0.10128616, -0.13695373, -0.10353946, -0.043571353, 0.05922437, -0.11293257, 0.0828006, -0.07322761, -0.08197273), target2);
	target2 = MulAdd(nh2, MF4x4(-0.0010509897, -0.1674067, 0.08191839, 0.056608744, 0.061343428, 0.19574693, 0.05302967, -0.006813754, -0.016064182, 0.22949885, -0.06631832, 0.034382205, 0.12674272, 0.06583508, 0.19319807, 0.011400221), target2);
	target2 = MulAdd(ni2, MF4x4(-0.032175347, -0.021227444, -0.027698517, 0.067299634, 0.23929007, 0.20669897, 0.004856941, 0.0009404045, 0.04919408, 0.020296812, 0.012571405, -0.16185577, -0.012276781, 0.16609742, -0.15718406, -0.20344186), target2);

	conv2d_3_tf[gxy] = target1;
	conv2d_3_tf1[gxy] = target2;
}

//!PASS 5
//!DESC Conv-4x3x3x16
//!IN conv2d_3_tf, conv2d_3_tf1
//!OUT conv2d_4_tf, conv2d_4_tf1
//!BLOCK_SIZE 8
//!NUM_THREADS 64

void Pass5(uint2 blockStart, uint3 threadId) {
	uint2 gxy = Rmp8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// [ a, d, g ]
	// [ b, e, h ]
	// [ c, f, i ]
	MF4 a1 = conv2d_3_tf.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	MF4 b1 = conv2d_3_tf.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	MF4 c1 = conv2d_3_tf.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	MF4 d1 = conv2d_3_tf.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	MF4 e1 = conv2d_3_tf.SampleLevel(sam, pos, 0);
	MF4 f1 = conv2d_3_tf.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	MF4 g1 = conv2d_3_tf.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	MF4 h1 = conv2d_3_tf.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	MF4 i1 = conv2d_3_tf.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	MF4 na1 = max(-a1, 0);
	MF4 nb1 = max(-b1, 0);
	MF4 nc1 = max(-c1, 0);
	MF4 nd1 = max(-d1, 0);
	MF4 ne1 = max(-e1, 0);
	MF4 nf1 = max(-f1, 0);
	MF4 ng1 = max(-g1, 0);
	MF4 nh1 = max(-h1, 0);
	MF4 ni1 = max(-i1, 0);

	a1 = max(a1, 0);
	b1 = max(b1, 0);
	c1 = max(c1, 0);
	d1 = max(d1, 0);
	e1 = max(e1, 0);
	f1 = max(f1, 0);
	g1 = max(g1, 0);
	h1 = max(h1, 0);
	i1 = max(i1, 0);

	MF4 a2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	MF4 b2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	MF4 c2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	MF4 d2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	MF4 e2 = conv2d_3_tf1.SampleLevel(sam, pos, 0);
	MF4 f2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	MF4 g2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	MF4 h2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	MF4 i2 = conv2d_3_tf1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	MF4 na2 = max(-a2, 0);
	MF4 nb2 = max(-b2, 0);
	MF4 nc2 = max(-c2, 0);
	MF4 nd2 = max(-d2, 0);
	MF4 ne2 = max(-e2, 0);
	MF4 nf2 = max(-f2, 0);
	MF4 ng2 = max(-g2, 0);
	MF4 nh2 = max(-h2, 0);
	MF4 ni2 = max(-i2, 0);

	a2 = max(a2, 0);
	b2 = max(b2, 0);
	c2 = max(c2, 0);
	d2 = max(d2, 0);
	e2 = max(e2, 0);
	f2 = max(f2, 0);
	g2 = max(g2, 0);
	h2 = max(h2, 0);
	i2 = max(i2, 0);

	MF4 target1 = { 0.08911729, -0.027969634, -0.010653148, -0.08001697 };
	target1 = MulAdd(a1, MF4x4(0.010501252, -0.046741538, -0.0017120431, -0.04840009, 0.20547974, 0.3366821, -0.10182207, 0.17451541, -0.03404171, -0.15138055, 0.16771653, -0.07168161, 0.102572344, 0.08266354, 0.20205829, 0.13429944), target1);
	target1 = MulAdd(b1, MF4x4(0.05584234, 0.06844309, 0.025430907, 0.124140054, 0.36385667, 0.12099467, -0.41671994, 0.085477844, 0.19748127, -0.21473993, 0.005037813, -0.3973761, 0.04669592, -0.100342326, -0.09403772, -0.034248166), target1);
	target1 = MulAdd(c1, MF4x4(-0.17654696, 0.009085064, 0.028360577, 0.033909567, 0.09377573, 0.27896938, 0.103994116, 0.0008595595, 0.064523555, 0.040994007, -0.06337235, 0.05662917, 0.0037455747, 0.017608117, -0.14610702, 1.2175746e-05), target1);
	target1 = MulAdd(d1, MF4x4(-0.04631749, -0.14251712, -0.16420849, -0.16259338, 0.46187812, 0.17576592, 0.00049142196, 0.029193122, -0.003925961, -0.11218227, 0.007026237, -0.20583045, -0.0010964901, 0.19355829, 0.2221649, 0.1187224), target1);
	target1 = MulAdd(e1, MF4x4(-0.041567978, -0.31510913, 0.01618704, 0.04979329, 0.101294376, 0.16356954, 0.21361789, 0.20735294, 0.1900854, -0.4151726, -0.30471593, -0.59483325, 0.033624128, 0.11495109, -0.15194787, 0.4920959), target1);
	target1 = MulAdd(f1, MF4x4(-0.18910064, -0.06516878, -0.20508374, -0.063928686, 0.7289614, 0.26674315, 0.2929481, 0.4026098, -0.033123735, -0.090371035, -0.029094126, -0.15197921, -0.08723726, -0.060160585, -0.07908409, -0.08826931), target1);
	target1 = MulAdd(g1, MF4x4(-0.08321312, -0.09749648, -0.08783197, -0.23072585, 0.24343425, 0.10888949, 0.17419606, 0.04136083, 0.0066000987, -0.06112787, -0.12176007, -0.20907228, -0.0008522778, -0.054704696, -0.07197735, -0.0877179), target1);
	target1 = MulAdd(h1, MF4x4(-0.40559706, -0.3801705, 0.05970925, -0.6157092, 0.28944594, 0.1252121, 0.403247, -0.122819394, -0.096336536, -0.2324694, 0.05980106, -0.19970767, -0.16646989, -0.10164633, -0.09282806, -0.08897996), target1);
	target1 = MulAdd(i1, MF4x4(-0.14336498, -0.12967408, -0.016268672, -0.021431219, -0.0850116, 0.37105832, -0.04093888, 0.08540873, 0.035717323, -0.07282701, -0.009123291, -0.0036565473, -0.02508944, -0.087611906, 0.03604423, -0.00089080486), target1);
	target1 = MulAdd(a2, MF4x4(0.1373875, 0.05283984, -0.11992707, 0.102294855, 0.3305128, 0.044920854, 0.31622922, -0.04711731, 0.001336024, 0.022799017, -0.062343203, 0.017140022, -0.07556853, -0.12864219, -0.25721326, -0.20741239), target1);
	target1 = MulAdd(b2, MF4x4(0.22062224, 0.09266222, 0.22466063, 0.18527372, -0.06940306, 0.1317168, 0.019784274, -0.07422301, 0.04061616, 0.0022494853, 0.21723995, 0.24732308, 0.14088804, 0.0116154915, 0.102064446, 0.020701224), target1);
	target1 = MulAdd(c2, MF4x4(-0.025154127, 0.045180723, -0.05877639, -0.099235624, 0.13630918, 0.24653725, -0.05723323, -0.022995364, -0.10826078, 0.049667366, 0.12618053, 0.1557369, 0.037487056, -0.22215757, 0.005912914, -0.20549043), target1);
	target1 = MulAdd(d2, MF4x4(0.09641055, 0.098845296, -0.08192096, -0.03691394, -0.18450394, 0.29955688, -0.082493715, -0.06268039, -0.0754319, 0.21018648, -0.016580105, -0.1810546, 0.13857666, -0.0327626, 0.03161804, -0.32589525), target1);
	target1 = MulAdd(e2, MF4x4(-0.18272439, -0.17595461, 0.047229152, 0.14596708, 0.40453747, 0.5658558, -0.17969102, 0.21557859, -0.34232348, 0.40355968, 0.53874254, 0.0012561383, 0.28154096, -0.06745097, -0.13049632, 0.42997465), target1);
	target1 = MulAdd(f2, MF4x4(0.081179485, -0.0041369614, -0.12001932, -0.102107175, -0.050293338, 0.29165673, 0.08062538, 0.22925815, 0.19389379, 0.28463286, -0.057207666, 0.23133168, -0.07545728, 0.06729763, -0.103593476, 0.014468794), target1);
	target1 = MulAdd(g2, MF4x4(0.069821335, -0.010299579, 0.069458775, 0.03894593, -0.054688405, 0.32758355, 0.13935772, 0.37506017, 0.24083133, -0.06105339, 0.25636867, 0.09627044, 0.08939188, 0.006728639, 0.10629504, 0.07887502), target1);
	target1 = MulAdd(h2, MF4x4(0.10563019, 0.077379815, 0.045456886, 0.09303406, 0.11326298, 0.28762257, -0.35142374, 0.10285745, 0.28762287, 0.3592446, 0.23816557, 0.22676824, 0.030372012, -0.028023086, -0.30956736, -0.27588373), target1);
	target1 = MulAdd(i2, MF4x4(0.110499, 0.009828844, 0.086689755, 0.1839749, 0.16656482, 0.083707325, 0.19506347, -0.01547141, 0.13804145, 0.2206598, -0.16484791, -0.0021595939, -0.06844408, -0.07861768, 0.040771082, -0.13347322), target1);
	target1 = MulAdd(na1, MF4x4(0.02667995, 0.019265587, -0.18211095, -0.102116466, -0.042541366, -0.07700912, -0.020587347, -0.03532171, 0.14816427, -0.1672272, -0.17522137, -0.04657808, 0.013430233, -0.0021270285, 0.109880306, 0.004838907), target1);
	target1 = MulAdd(nb1, MF4x4(0.14285165, -0.1364756, 0.017568532, -0.27690783, -0.015461915, 0.045437083, 0.018187419, 0.12473493, 0.17991658, -0.15642665, 0.10009151, -0.19040193, 0.1734127, -0.13817501, 0.0710856, -0.12921426), target1);
	target1 = MulAdd(nc1, MF4x4(-0.14114712, -0.18893671, 0.16121174, 0.035988737, 0.17872387, -0.106395856, -0.23183517, 0.012380416, 0.043066982, -0.28539032, -0.049011275, -0.21125022, -0.11976977, -0.015564958, 0.18880925, -0.0034812456), target1);
	target1 = MulAdd(nd1, MF4x4(-0.05894521, 0.17266215, -0.0458901, 0.08049924, 0.0156061025, -0.0047465423, 0.09714626, 0.045990974, -0.08786066, -0.37803304, -0.19629405, -0.08546443, 0.014874948, 0.16931784, 0.24799919, 0.06316819), target1);
	target1 = MulAdd(ne1, MF4x4(-0.28352743, 0.29973608, -0.014540065, 0.2865005, 0.048086923, 0.18976144, 0.22969759, 0.1643124, -0.11259408, -0.107592925, 0.184308, 0.30998367, -1.0860825, -0.29118305, -0.51242536, -0.38492215), target1);
	target1 = MulAdd(nf1, MF4x4(-0.17199941, -0.14274743, -0.14213641, -0.1691383, -0.17294803, -0.013992068, -0.12135059, 0.082377024, -0.11255549, -0.124990575, -0.32526177, -0.08199375, -0.25591666, 0.1882329, 0.07895415, 0.22012262), target1);
	target1 = MulAdd(ng1, MF4x4(0.026025832, -0.07267515, 0.09738688, 0.074536435, -0.060470507, -0.037861936, 0.0507819, -0.054857653, 0.0043173633, -0.18107842, -0.02996759, 0.04072402, -0.012617744, 0.061665237, 0.0013981885, 0.08679919), target1);
	target1 = MulAdd(nh1, MF4x4(0.27913737, 0.39656082, 0.1579819, 0.2774727, -0.007996453, 0.08704765, -0.016933938, 0.07066135, 0.12361742, -0.20802726, -0.13705719, -0.18794124, 0.037409827, -0.03351758, -0.2970392, -0.11001984), target1);
	target1 = MulAdd(ni1, MF4x4(-0.027419567, 0.043236237, -0.19843115, -0.056489736, -0.017010912, 0.070949584, -0.14881176, -0.0780235, 0.0039477753, -0.16772608, -0.009547604, -0.14060417, 0.0103197545, 0.07129672, 0.034949142, 0.014112084), target1);
	target1 = MulAdd(na2, MF4x4(-0.06467971, 0.084101565, 0.26296136, 0.08878442, -0.11232121, -0.054373942, -0.17263442, 0.046408508, 0.032239515, 0.042490713, 0.036938053, -0.034339923, -0.07139367, 0.032505415, 0.0045828503, 0.24428385), target1);
	target1 = MulAdd(nb2, MF4x4(0.053585388, -0.08175568, -0.04787236, 0.06061965, -0.0740297, 0.11113596, -0.12467945, 0.08229154, -0.01941305, 0.12903687, 0.09095716, -0.13062255, -0.0102068605, 0.107291475, 0.030279635, 0.07464777), target1);
	target1 = MulAdd(nc2, MF4x4(0.11041978, -0.0123585425, 0.11147018, 0.07380536, 0.06632908, 0.011784447, 0.029638765, -0.01566135, 0.009105331, 0.05252663, -0.17972581, 0.01210126, -0.10749957, -0.028144639, -0.105761215, 0.083784826), target1);
	target1 = MulAdd(nd2, MF4x4(-0.058018316, 0.15083058, 0.2725673, 0.024263225, -0.067711554, 0.051117413, -0.31144425, -0.15761986, 0.017503206, -0.14361219, -0.38261738, -0.20354146, -0.04211545, 0.12921454, -0.01319619, 0.35809723), target1);
	target1 = MulAdd(ne2, MF4x4(-0.107978396, 0.3230084, -0.13806303, 0.12903036, 0.039864987, -0.006241628, 0.18701774, -0.10785807, 0.30056882, -0.3092082, -0.4273322, 0.3784662, -0.026107281, 0.23165871, 0.35258314, -0.06654702), target1);
	target1 = MulAdd(nf2, MF4x4(-0.15840323, 0.15210885, 0.04086692, 0.19169305, 0.11847602, 0.0009038581, 0.095951624, 0.043941673, 0.1512248, 0.0749449, -0.027045414, -0.19729601, 0.08265063, -0.045218006, -0.10732461, 0.05197371), target1);
	target1 = MulAdd(ng2, MF4x4(0.13637526, 0.28841978, 0.10298119, -0.005948496, 0.020897362, -0.02186902, -0.16207378, -0.021084815, 0.029192554, 0.07076991, -0.07210881, -0.06752328, 0.0006557475, 0.08986717, -0.29430988, 0.21411087), target1);
	target1 = MulAdd(nh2, MF4x4(0.18667863, 0.3117322, -0.0859705, -0.038189936, 0.10214859, -0.11244034, 0.2680223, -0.072901204, -0.07434324, -0.17855306, 0.23134363, -0.055360887, -0.020968167, 0.0858459, 0.078975916, 0.13254759), target1);
	target1 = MulAdd(ni2, MF4x4(-0.15676941, 0.03476677, -0.09922334, -0.15847856, -0.0033982224, 0.020932984, 0.12874377, 0.048792202, 0.06521213, 0.12456798, 0.15958112, 0.15981804, 0.07657683, 0.1759313, 0.012727211, 0.120304115), target1);
	
	MF4 target2 = { -0.04928105, -0.003357327, -0.03886671, 0.076106146 };
	target2 = MulAdd(a1, MF4x4(0.003206617, 0.04896987, 0.049652386, 0.10869342, 0.36313584, -0.070666805, 0.93581825, -0.52484274, -0.14278883, 0.064016834, -0.05534331, 0.02961736, -0.1319316, 0.05740655, 0.2405951, -0.12313382), target2);
	target2 = MulAdd(b1, MF4x4(0.014092832, 0.07058761, -0.07887866, -0.27478936, -0.31456405, -0.31036922, -0.18380909, -0.11277979, -0.034889866, -0.37914017, -0.056245584, 0.24008954, -0.03414483, -0.023189066, -0.010568316, -0.004604883), target2);
	target2 = MulAdd(c1, MF4x4(0.15443979, -0.050161768, -0.012300917, -0.08834887, 0.082193285, 0.06878423, 0.1478042, -0.3774468, -0.18659878, 0.14238152, 0.033605397, 0.13560006, -0.032682173, -0.024561955, 0.05656941, -0.034246165), target2);
	target2 = MulAdd(d1, MF4x4(0.04691462, 0.064624496, -0.15950382, 0.16081297, -0.1417951, -0.109690994, -0.021205869, 0.19361454, -0.006306647, 0.3401972, -0.00014070333, 0.11619607, -0.13437814, 0.05464789, 0.37712076, -0.12470751), target2);
	target2 = MulAdd(e1, MF4x4(-0.40016884, 0.010666597, -0.005395378, 0.51084363, -0.009875391, 0.3969395, 0.47768033, -0.3392299, -0.1509509, -0.057620626, -0.1834601, -0.09998148, 0.10095897, -0.2213528, 0.02546703, -0.28506726), target2);
	target2 = MulAdd(f1, MF4x4(0.26652217, -0.106772706, -0.12609608, -0.0949661, -0.10869194, -0.55331933, -0.011515521, -0.27978876, -0.2124893, 0.03954004, 0.1691768, 0.05590268, 0.1539662, 0.10703386, -0.027286088, 0.2168544), target2);
	target2 = MulAdd(g1, MF4x4(-0.04862511, 0.06919758, -0.12962708, 0.016036907, -0.030030789, -0.20159967, 0.0013158675, -0.07799172, -0.032236706, -0.0035921712, -0.085437834, -0.025374755, -0.06251374, -0.009269627, -0.07519051, -0.01884611), target2);
	target2 = MulAdd(h1, MF4x4(0.23940067, -0.19496065, -0.05494683, 0.11601073, -0.074225076, 0.24976431, 0.41665986, 0.12029472, 0.16815041, -0.115868434, 0.06333614, 0.032145746, 0.15990137, -0.14886795, 0.034102913, -0.07727595), target2);
	target2 = MulAdd(i1, MF4x4(0.14702639, -0.013711502, 0.011437429, -0.11201445, -0.2582659, 0.34539905, 0.058082145, -0.18346462, 0.0027891365, 0.072565466, 0.12716974, 0.050636146, 0.092657596, 0.08541754, -0.1266164, 0.027881607), target2);
	target2 = MulAdd(a2, MF4x4(0.043362036, 0.020758621, 0.09906072, -0.22401148, -0.19104514, -0.25774476, 0.074128486, 0.08558291, -0.075419895, 0.20380639, 0.06398196, 0.015925938, 0.089786015, -0.100721814, -0.1374862, 0.26110905), target2);
	target2 = MulAdd(b2, MF4x4(-0.12547149, 0.08151811, -0.15953775, -0.33995447, -0.50784314, 0.46155545, 0.24986996, 0.03404644, -0.047789436, -0.12438347, -0.14143273, -0.17951359, -0.08057819, 0.023863006, -0.008539273, -0.06775414), target2);
	target2 = MulAdd(c2, MF4x4(0.1430169, 0.056971863, -0.021576611, -0.045342956, -0.22356391, -0.15344621, -0.0467977, -0.22970036, -0.0125351725, 0.16957329, -0.0069183917, -0.013949834, -0.048609708, 0.05261722, 0.023262242, 0.2123519), target2);
	target2 = MulAdd(d2, MF4x4(-0.019523792, 0.008228363, -0.04616012, -0.14341992, -0.19307113, 0.005937241, 0.24048887, -0.04279845, 0.022574252, 0.15558265, -0.035000063, 0.18318397, -0.05392528, -0.26044658, -0.13493988, 0.056433514), target2);
	target2 = MulAdd(e2, MF4x4(-0.28926027, -0.17381874, 0.07685766, -0.0061521684, -0.47455552, -0.49213487, 0.36924496, 0.29042044, 0.201094, -0.14280887, -0.4531411, -0.52902204, -0.28123, 0.1401882, 0.32054895, -0.11357518), target2);
	target2 = MulAdd(f2, MF4x4(0.14173324, -0.12069898, -0.07242415, 0.105665006, 0.017373435, -0.056042343, 0.07270201, 0.022111928, -0.01106541, 0.01666006, 0.013564169, -0.36628693, -0.25450787, -0.28179473, -0.04721874, -0.21912882), target2);
	target2 = MulAdd(g2, MF4x4(-0.09464695, -0.027919646, 0.13088459, 0.17504032, -0.101641014, 0.29687008, 0.08832321, 0.020538324, -0.15108941, -0.21930224, -0.026915176, -0.07078217, 0.10723033, 0.034364715, 0.18183397, -0.119012214), target2);
	target2 = MulAdd(h2, MF4x4(-0.21713468, -0.0846604, 0.046551514, -0.14989382, 0.08672032, -0.07933831, 0.08093595, -0.064147756, -0.15980323, 0.50000644, -0.091568656, 0.03201994, -0.1848647, -0.0646309, 0.03288009, 0.046442386), target2);
	target2 = MulAdd(i2, MF4x4(0.053532355, -0.054523747, -0.040242642, -0.31438905, 0.06452703, -0.18785381, -0.14987698, -0.067642935, -0.19892459, -0.057256676, 0.05943023, -0.17331842, 0.02588534, 0.13134238, -0.07121775, 0.23446162), target2);
	target2 = MulAdd(na1, MF4x4(0.20633182, 0.01686198, 0.17934167, -0.02063493, 0.042606052, -0.05289458, 0.031508356, 0.00082803797, 0.0756423, -0.047548845, 0.01456339, 0.15910533, -0.20119642, 0.029213727, 0.111036316, -0.047010012), target2);
	target2 = MulAdd(nb1, MF4x4(0.09258436, -0.27904224, -0.086695746, 0.33095327, -0.20126075, -0.050745636, -0.048944805, -0.10536587, -0.012995092, 0.07926994, 0.15071853, -0.13644052, -0.05188447, -0.06750699, -0.14227037, 0.028751127), target2);
	target2 = MulAdd(nc1, MF4x4(-0.18562223, 0.10250865, -0.17573993, 0.20434102, -0.05187468, -0.06441594, -0.052127104, -0.01925564, 0.02927959, -0.12711872, 0.059629507, 0.15696885, -0.010168965, 0.09971862, -0.03177664, -0.022744441), target2);
	target2 = MulAdd(nd1, MF4x4(0.21474063, -0.15679085, 0.09609374, 0.109079376, -0.049934637, -0.07393633, 0.16688468, -0.018888129, 0.04240162, -0.31895876, -0.106516436, 0.20008606, -0.054410245, 0.028970616, -0.18008347, -0.013362003), target2);
	target2 = MulAdd(ne1, MF4x4(0.37891293, 0.042730846, -0.24735828, -0.5234527, -0.3681344, -0.06609157, -0.14993733, -0.020316398, 0.123008475, 0.29632482, 0.32149333, 0.35999274, -0.18967044, 0.46154186, -0.016041815, 0.097378336), target2);
	target2 = MulAdd(nf1, MF4x4(-0.14873263, 0.07600569, -0.051758345, 0.1803135, -0.23121934, 0.13574593, 0.043973465, -0.13992754, -0.061972607, -0.124083005, -0.049196843, -0.07700431, 0.21572952, -0.25241727, 0.1218322, -0.07773728), target2);
	target2 = MulAdd(ng1, MF4x4(0.040287063, 0.024240922, 0.021917762, -0.050616946, -0.023174169, 0.05977014, 0.018892275, 0.04014965, 0.11715485, 0.062129, 0.024620812, 0.013617107, 0.075699426, 0.1858111, -0.11769179, -0.08085602), target2);
	target2 = MulAdd(nh1, MF4x4(-0.3194255, 0.08695645, -0.09453595, 0.2564516, 0.02192303, 0.08167247, -0.06257352, 0.043801844, 0.04392246, 0.2020571, 0.045180902, 0.18857521, 0.1835961, -0.043788187, -0.08768916, -0.14755538), target2);
	target2 = MulAdd(ni1, MF4x4(-0.22074097, 0.13768476, -0.16183749, 0.059949517, -0.011375954, 0.08581876, 0.004800447, 0.019403988, 0.014646056, 0.07363176, -0.058036458, 0.0706421, 0.08082624, 0.17740329, -0.05484784, 0.050796065), target2);
	target2 = MulAdd(na2, MF4x4(-0.032330472, -0.067666024, 0.18980837, -0.19077848, 0.1111905, 0.03855666, -0.11272314, -0.00577739, 0.17697452, -0.053044144, -0.07510145, 0.061853852, -0.024240626, 0.14846492, 0.14804313, -0.20275854), target2);
	target2 = MulAdd(nb2, MF4x4(0.17133904, -0.16356844, 0.1978664, 0.13877816, 0.28208038, 0.031539194, 0.11313891, -0.0014802719, 0.0033749861, 0.046372313, 0.054808807, -0.0024151779, 0.0068782056, -0.16414621, -0.07545907, -0.2521294), target2);
	target2 = MulAdd(nc2, MF4x4(-0.1746992, -0.037628956, -0.0044012754, -0.004390821, 0.0050341445, -0.112742625, 0.051241755, 0.01984483, 0.0003531837, 0.043500375, 0.030881992, 0.003503799, 0.13611782, -0.02509031, -0.007503557, -0.009321301), target2);
	target2 = MulAdd(nd2, MF4x4(0.087250136, 0.12374122, 0.2959519, 0.11314702, 0.22080182, 0.106726184, -0.29768205, 0.14931595, 0.23356548, -0.008709153, -0.0797829, 0.046940215, -0.07027616, 0.20533602, 0.0723021, -0.1963585), target2);
	target2 = MulAdd(ne2, MF4x4(0.00609982, 0.35277408, -0.22781096, -0.28912535, 0.42393112, -0.07654207, 0.12636793, 0.049337976, -0.0967726, -0.19349189, 0.36800626, 0.09745645, 0.47663373, 0.03876107, -0.042987954, 0.016161885), target2);
	target2 = MulAdd(nf2, MF4x4(-0.047490966, -0.05823166, 0.036158644, 0.025337253, -0.046618905, 0.108276576, -0.024148034, 0.0026794411, 0.1497962, -0.09328474, -0.03160641, 0.24351281, -0.05198027, 0.030720685, 0.00014528916, -0.2224931), target2);
	target2 = MulAdd(ng2, MF4x4(-0.007338369, 0.18710312, 0.14617369, -0.0070655346, 0.10464997, -0.029674934, -0.11842202, -0.09114357, 0.08524458, -0.08082762, 0.06479597, -0.023760766, 0.07523641, 0.0067315935, 0.101266846, -0.2780903), target2);
	target2 = MulAdd(nh2, MF4x4(0.14181875, -0.19523518, 0.1068169, -0.10284853, 0.11634046, -0.117397435, 0.09113022, 0.009371062, -0.022120507, -0.1127032, 0.092574745, -0.021989716, -0.088107705, -0.13541982, 0.08130504, -0.0678927), target2);
	target2 = MulAdd(ni2, MF4x4(0.09948295, 0.23699793, -0.042369924, 0.16744529, -0.10045506, -0.045623623, 0.04871897, -0.0023967526, 0.02602692, -0.089873284, -0.050681606, -0.09332558, -0.09596149, -0.06988313, 0.0007193808, -0.11936899), target2);

	conv2d_4_tf[gxy] = target1;
	conv2d_4_tf1[gxy] = target2;
}

//!PASS 6
//!DESC Conv-4x3x3x16
//!IN conv2d_4_tf, conv2d_4_tf1
//!OUT conv2d_5_tf, conv2d_5_tf1
//!BLOCK_SIZE 8
//!NUM_THREADS 64

void Pass6(uint2 blockStart, uint3 threadId) {
	uint2 gxy = Rmp8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// [ a, d, g ]
	// [ b, e, h ]
	// [ c, f, i ]
	MF4 a1 = conv2d_4_tf.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	MF4 b1 = conv2d_4_tf.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	MF4 c1 = conv2d_4_tf.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	MF4 d1 = conv2d_4_tf.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	MF4 e1 = conv2d_4_tf.SampleLevel(sam, pos, 0);
	MF4 f1 = conv2d_4_tf.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	MF4 g1 = conv2d_4_tf.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	MF4 h1 = conv2d_4_tf.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	MF4 i1 = conv2d_4_tf.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	MF4 na1 = max(-a1, 0);
	MF4 nb1 = max(-b1, 0);
	MF4 nc1 = max(-c1, 0);
	MF4 nd1 = max(-d1, 0);
	MF4 ne1 = max(-e1, 0);
	MF4 nf1 = max(-f1, 0);
	MF4 ng1 = max(-g1, 0);
	MF4 nh1 = max(-h1, 0);
	MF4 ni1 = max(-i1, 0);

	a1 = max(a1, 0);
	b1 = max(b1, 0);
	c1 = max(c1, 0);
	d1 = max(d1, 0);
	e1 = max(e1, 0);
	f1 = max(f1, 0);
	g1 = max(g1, 0);
	h1 = max(h1, 0);
	i1 = max(i1, 0);

	MF4 a2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	MF4 b2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	MF4 c2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	MF4 d2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	MF4 e2 = conv2d_4_tf1.SampleLevel(sam, pos, 0);
	MF4 f2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	MF4 g2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	MF4 h2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	MF4 i2 = conv2d_4_tf1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	MF4 na2 = max(-a2, 0);
	MF4 nb2 = max(-b2, 0);
	MF4 nc2 = max(-c2, 0);
	MF4 nd2 = max(-d2, 0);
	MF4 ne2 = max(-e2, 0);
	MF4 nf2 = max(-f2, 0);
	MF4 ng2 = max(-g2, 0);
	MF4 nh2 = max(-h2, 0);
	MF4 ni2 = max(-i2, 0);

	a2 = max(a2, 0);
	b2 = max(b2, 0);
	c2 = max(c2, 0);
	d2 = max(d2, 0);
	e2 = max(e2, 0);
	f2 = max(f2, 0);
	g2 = max(g2, 0);
	h2 = max(h2, 0);
	i2 = max(i2, 0);

	MF4 target1 = { 0.0095873345, 0.04959374, -0.15246227, 0.0044831373 };
	target1 = MulAdd(a1, MF4x4(-0.13425097, -0.23487093, 0.2480183, -0.2806276, -0.041303713, 0.100773126, -0.110890545, 0.036205858, -0.331331, -0.12929262, 0.16300063, 0.3776673, -0.20316373, -0.011239426, 0.10650887, -0.027857736), target1);
	target1 = MulAdd(b1, MF4x4(0.09517376, -0.3004956, 0.05033304, -0.07464521, 0.009204248, -0.23034886, 0.30492118, -0.1215848, 0.15728685, -0.10430078, 0.04038878, 0.08034804, 0.04320418, -0.2929594, -0.018968396, 0.02542387), target1);
	target1 = MulAdd(c1, MF4x4(-0.10651935, -0.2736715, 0.19267319, -0.033337504, -0.06697293, 0.028424729, 0.047814637, 0.44929537, 0.02565344, -0.253426, -0.040931404, -0.05018104, 0.032979824, -0.035349697, -0.039578713, -0.3116414), target1);
	target1 = MulAdd(d1, MF4x4(0.09176126, 0.031713437, 0.24861507, 0.31351718, 0.36284143, 0.3622709, 0.16165464, 0.07319267, -0.6303202, -0.21209712, -0.02169929, 0.037275597, -0.1295319, 0.033090707, -0.029330662, 0.054679472), target1);
	target1 = MulAdd(e1, MF4x4(0.15021572, -0.15177831, 0.1318225, 0.46864823, 0.059443284, 0.07404233, 0.22612074, 0.21105285, 0.319694, 0.09397257, 0.14277866, -0.0235649, -0.037205156, -0.40715128, -0.18572816, 0.058741573), target1);
	target1 = MulAdd(f1, MF4x4(-0.122751735, -0.20926422, 0.2099333, -0.11627138, 0.04171681, 0.0669586, -0.03831368, 0.27334675, 0.0492008, 0.12854317, 0.03308622, 0.45236585, 0.03122829, 0.13853219, 0.05084764, -0.3965012), target1);
	target1 = MulAdd(g1, MF4x4(-0.0019293908, -0.15562099, 0.12418126, 0.0045440597, 0.05442391, -0.15613738, 0.14828286, -0.17687118, -0.053517755, -0.33350968, -0.062314924, -0.31358472, -0.09670371, 0.043190923, 0.008150662, 0.09928506), target1);
	target1 = MulAdd(h1, MF4x4(-0.06698031, -0.099411525, 0.24259582, -0.1073659, 0.06762824, 0.059605874, -0.20944163, -0.1598055, 0.32746908, -0.17759447, 0.2859796, -0.1274256, 0.30796206, -0.00791448, 0.114059694, 0.14775705), target1);
	target1 = MulAdd(i1, MF4x4(0.16291203, -0.14958477, 0.14716864, 0.2056065, -0.019337546, 0.032286238, 0.0030445335, -0.08208513, -0.14208078, 0.13601872, -0.23367858, -0.19092909, -0.20207883, -0.016950991, 0.009309007, 0.1376546), target1);
	target1 = MulAdd(a2, MF4x4(-0.11093091, -0.32362202, -0.041845415, 0.029758021, -0.07261404, -0.048653398, 0.19167832, 0.09343212, 0.030472826, -0.15078579, -0.0056376588, 0.0045257527, -0.24521805, -0.10473077, 0.11163019, -0.1724187), target1);
	target1 = MulAdd(b2, MF4x4(-0.08601668, 0.16612485, -0.07751539, 0.07261594, -0.19028407, 0.23896623, -0.10416726, 0.23500614, 0.1955228, 0.08699591, -0.049277775, 0.13447775, 0.19434914, -0.11481196, 0.088043146, 0.13352895), target1);
	target1 = MulAdd(c2, MF4x4(-0.013221233, 0.07521129, 0.042819552, -0.11163175, 0.066080205, -0.25043094, -0.010348969, -0.013258202, 0.09444396, 0.29623637, 0.025016114, 0.050744686, -0.12219596, -0.0735393, -0.024817836, -0.06897588), target1);
	target1 = MulAdd(d2, MF4x4(-0.25720942, 0.19861753, -0.18535058, 0.12190362, -0.33756095, -0.0038898317, 0.09739055, 0.41227046, -0.10030796, 0.025445882, -0.23542109, 0.08677691, 0.08140194, -0.22716106, 0.14016968, -0.0927231), target1);
	target1 = MulAdd(e2, MF4x4(0.58745646, -0.12533307, 0.30129984, 0.08898194, -0.07972344, -0.37581098, 0.06863413, -0.13185541, 0.21801205, 0.31779078, -0.3804784, -0.3200699, 0.14534226, 0.05912262, 0.07938948, -0.34869507), target1);
	target1 = MulAdd(f2, MF4x4(0.024675166, -0.067802526, 0.030065436, 0.06509131, 0.14367498, 0.022554757, 0.014991865, -0.029914752, 0.5123549, -0.012557206, -0.13014166, -0.34184244, -0.09080884, 0.13782553, -0.018931886, -0.35642785), target1);
	target1 = MulAdd(g2, MF4x4(-0.37336427, -0.02705006, 0.14392053, 0.024049882, -0.024705589, 0.14556128, -0.12120506, -0.06275598, -0.1284325, 0.11409197, -0.08397436, -0.075944416, 0.056465942, 0.04016099, 0.096723564, -0.08359723), target1);
	target1 = MulAdd(h2, MF4x4(0.20243345, -0.09287934, -0.11676041, 0.005206654, -0.2879361, 0.41677123, -0.16924824, 0.22429213, 0.082279116, -0.1780833, 0.20209241, 0.12970525, -0.030272234, -0.19200714, 0.0015769673, -0.1389732), target1);
	target1 = MulAdd(i2, MF4x4(0.04211243, 0.07331798, -0.055724114, 0.04086206, -0.04635456, 0.027212424, 0.021861525, 0.12424812, 0.43009162, 0.021664696, 0.20828371, 0.11859106, 0.07390811, -0.1861182, 0.034559406, 0.18561925), target1);
	target1 = MulAdd(na1, MF4x4(0.22596797, 0.025346763, -0.056839246, 0.09137385, 0.07363095, -0.12382036, 0.08911783, -0.012355983, -0.07869761, 0.051298574, 0.00816572, -0.044984274, 0.07962154, -0.2254524, -0.007821531, -0.04936664), target1);
	target1 = MulAdd(nb1, MF4x4(0.06265961, -0.17783198, 0.11678783, -0.12965304, 0.014506855, -0.17513473, -0.23593299, 0.14054537, 0.1580306, 0.31872272, -0.0042505316, -0.070422255, -0.01316396, 0.0058355615, 0.062464185, -0.06086727), target1);
	target1 = MulAdd(nc1, MF4x4(-0.079526044, 0.23932967, -0.1139716, 0.15888569, 0.06526993, -0.06958436, -0.04070066, -0.12081254, 0.026716579, 0.014887845, 0.0061467467, 0.127956, 0.040913627, -0.0032820841, 0.086145625, 0.22520025), target1);
	target1 = MulAdd(nd1, MF4x4(0.25577608, 0.02553098, -0.14822578, -0.11907723, -0.09787419, -0.03544863, -0.08098151, -0.01305555, 0.20404844, 0.11294246, 0.10096346, 0.15795162, 0.2554626, 0.09361069, 0.001985862, -0.0051444587), target1);
	target1 = MulAdd(ne1, MF4x4(-0.24454486, -0.014714279, -0.2954907, -0.39995646, -0.15907967, 0.30107877, -0.34781745, 0.095281735, -0.12492393, -0.28375402, -0.16872306, 0.2531788, -0.52085644, 0.35986066, 0.07716912, 0.09565738), target1);
	target1 = MulAdd(nf1, MF4x4(0.2493129, 0.06395661, -0.09491958, 0.19702488, 0.109871864, -0.051376317, 0.15404263, -0.21282886, 0.1188967, 0.07824094, -0.016752928, -0.14027214, 0.10949832, -0.27629098, 0.081909016, 0.1354018), target1);
	target1 = MulAdd(ng1, MF4x4(0.18950915, -0.034574565, -0.10378051, -0.15800652, -0.06835184, -0.06987467, 0.035007782, 0.04686656, 0.054061133, 0.014833506, -0.0035361175, 0.016156103, 0.120767444, -0.10196722, 0.10668838, -0.09058739), target1);
	target1 = MulAdd(nh1, MF4x4(-0.032248627, 0.056413256, 0.042716432, 0.06681831, 0.047605485, -0.07629479, 0.14311917, -0.06909803, 0.10640394, 0.10701861, -0.0051839007, -0.15133362, -0.32146424, -0.039978918, -0.12280021, 0.0048507582), target1);
	target1 = MulAdd(ni1, MF4x4(-0.1954503, -0.09257865, 0.11023244, -0.01817947, -0.0035485283, -0.015536726, 0.0071826433, 0.042538714, -0.015454641, 0.079593316, -0.07242554, 0.031178504, 0.2319168, -0.10519467, 0.013837495, -0.040088437), target1);
	target1 = MulAdd(na2, MF4x4(0.12625901, 0.04531166, 0.038758352, -0.05790713, -0.10029771, -0.118265375, -0.23944628, 0.11955388, 0.070732996, 0.19404806, -0.019913414, 0.04609079, 0.06262817, 0.022330387, -0.029681094, 0.03719176), target1);
	target1 = MulAdd(nb2, MF4x4(-0.07737922, 0.0024623116, -0.037666153, -0.19271135, -0.015002153, -0.0059966356, 0.0024538909, -0.0401021, -0.18540399, -0.11140236, -0.11102473, -0.06390247, 0.016754225, 0.35000673, -0.19139731, 0.07363001), target1);
	target1 = MulAdd(nc2, MF4x4(0.02150171, -0.2311761, -0.025124706, 0.16819553, -0.0013348719, 0.32091036, -0.061826598, 0.12579474, -0.036611024, -0.018266583, -0.11280143, 0.11073158, 0.050171874, -0.14706045, 0.029553955, 0.0052631944), target1);
	target1 = MulAdd(nd2, MF4x4(0.19249865, -0.22854832, 0.09472751, 0.014705341, 0.059496958, 0.13427268, -0.06309558, -0.07153743, -0.31890163, -0.0657967, -0.040345218, 0.09544393, 0.07359761, 0.11245483, 0.00033233972, 0.031550154), target1);
	target1 = MulAdd(ne2, MF4x4(-0.24668917, -0.37181908, -0.50614715, -0.101197146, -0.1569055, 0.27734125, 0.17144768, -0.04336267, 0.03658949, 0.06747124, 0.30720958, 0.56301194, -0.11314631, -0.29258573, 0.16256689, 0.5221001), target1);
	target1 = MulAdd(nf2, MF4x4(-0.022761503, 0.13063031, 0.002526217, -0.03466151, -0.15225072, 0.40217137, -0.089131154, 0.19195192, -0.1379853, -0.04640692, 0.104670234, 0.12268618, -0.012009209, -0.20534724, 0.028777445, 0.22195113), target1);
	target1 = MulAdd(ng2, MF4x4(0.23697586, 0.08793654, -0.10565018, 0.013993297, -0.025932996, -0.13859354, 0.14333159, -0.099132575, -0.049601994, -0.0917448, -0.0021633878, -0.009032609, -0.034750953, -0.30761167, 0.058994945, -0.19427797), target1);
	target1 = MulAdd(nh2, MF4x4(-0.26944515, 0.30523893, -0.17787015, 0.10827742, 0.06457236, -0.12202401, 0.15371302, 0.011699893, -0.06253491, -0.10976804, -0.37283847, -0.23996784, -0.2750512, -0.024101513, -0.094127975, -0.17462716), target1);
	target1 = MulAdd(ni2, MF4x4(-0.026286924, 0.06250577, 0.095423855, -0.02849258, -0.12916361, -0.10954709, -0.05825132, -0.102924265, -0.19550376, -0.11730307, 0.032346163, -0.17682706, 0.16651174, 0.031927045, -0.004800601, -0.06323844), target1);
	
	MF4 target2 = { -0.001935585, 0.05018077, -0.0154469935, -0.034524206 };
	target2 = MulAdd(a1, MF4x4(-0.021453971, -0.108874515, 0.0005208881, -0.09774453, -0.0053757126, 0.20114918, 0.24454592, 0.04932251, -0.0037210248, -0.0240578, -0.07736935, 0.27604944, -0.12430849, -0.13093218, -0.014840212, 0.13450128), target2);
	target2 = MulAdd(b1, MF4x4(-0.19143668, -0.23023333, -0.10232715, 0.24396868, 0.056112397, 0.14535592, -0.25882182, -0.26274678, -0.23119931, 0.07735849, -0.14785223, -0.21026523, -0.2064457, -0.34512606, -0.17808662, 0.30146623), target2);
	target2 = MulAdd(c1, MF4x4(0.0072161015, -0.013303738, 0.07591899, 0.027883789, 0.210858, 0.1422139, -0.027882019, 0.2618474, -0.048504543, 0.07377317, -0.05427271, -0.10014041, -0.12974857, -0.13140713, -0.02249253, 0.08203184), target2);
	target2 = MulAdd(d1, MF4x4(0.07855138, -0.13984342, 0.10037151, -0.056781758, 0.24686107, -0.0048190085, -0.2693424, 0.31722167, -0.28716075, -0.06422215, -0.06738793, -0.06723655, -0.08194382, -0.007975044, 0.20108353, -0.13338897), target2);
	target2 = MulAdd(e1, MF4x4(0.35129568, 0.27930936, 0.024239251, -0.10712293, 0.48684034, -0.04380574, -0.0064479653, 0.03754327, -0.13139078, -0.44939983, -1.0460628, -0.016004754, -0.14476573, -0.07113434, 0.515311, -0.400374), target2);
	target2 = MulAdd(f1, MF4x4(0.13104302, -0.23410062, 0.091530964, -0.003652217, 0.16696814, 0.16406855, -0.08138474, 0.047526445, 0.25358474, 0.37850454, 0.0362802, -0.046476766, -0.093869686, -0.4143772, 0.08641024, 0.115896136), target2);
	target2 = MulAdd(g1, MF4x4(-0.04416574, -0.052188106, 0.05141859, -0.008132604, -0.013658864, 0.1021097, 0.19391364, -0.09257973, 0.15225394, -0.16920799, -0.16172324, 0.41466942, -0.07087308, 0.08632938, -0.07496043, -0.023530172), target2);
	target2 = MulAdd(h1, MF4x4(0.09337352, 0.062108494, -0.219173, -0.046151914, 0.22507025, -0.08966131, -0.123690315, 0.08666376, -0.10731867, -0.08518657, 0.024199447, 0.17898631, 0.120247275, 0.089923285, -0.08756211, 0.1775775), target2);
	target2 = MulAdd(i1, MF4x4(0.20326594, -0.060535498, -0.061659336, 0.113954924, -0.073462196, 0.15917051, 0.11728326, -0.072256014, -0.0752342, 0.06265616, -0.19494365, -0.25413772, -0.06641352, -0.015642308, 0.16825356, 0.0027654327), target2);
	target2 = MulAdd(a2, MF4x4(-0.17029639, -0.05388927, -0.13159063, 0.0795609, 0.00501164, -0.0703107, -0.08229201, 0.07546247, 0.092942156, 0.059050936, -0.07987315, 0.010874322, 0.037708692, -0.0017377702, -0.030414931, 0.28946167), target2);
	target2 = MulAdd(b2, MF4x4(-0.2692667, 0.2258295, 0.062060453, 0.1934921, -0.023051793, -0.038611185, 0.21473692, 0.33520013, 0.029885106, 0.103782356, 0.05217351, -0.13349791, -0.034186684, -0.3015818, 0.033423528, 0.21218027), target2);
	target2 = MulAdd(c2, MF4x4(-0.013587494, 0.021273775, -0.022650799, -0.011939531, -0.11202949, 0.09365859, -0.042938907, -0.009910716, 0.27254924, 0.07752608, 0.029586637, 0.024899973, 0.04375618, 0.31453863, -0.006775175, 0.008228053), target2);
	target2 = MulAdd(d2, MF4x4(-0.49562672, -0.12472124, -0.13618441, 0.09660054, -0.2275429, -0.0902811, 0.18311924, 0.11677185, -0.13325182, -0.061613016, -0.011462703, -0.12538978, 0.054934092, 0.06742866, 0.25515345, 0.35692096), target2);
	target2 = MulAdd(e2, MF4x4(0.5266911, -0.09655596, -0.41069564, -0.3174325, 0.1431904, -0.17732115, -0.36320353, 0.37975433, -0.5158582, -0.21019879, 0.06852925, -0.06648648, -0.18956456, -0.018139647, 0.35707653, 0.07378416), target2);
	target2 = MulAdd(f2, MF4x4(0.04151976, -0.037361674, 0.06936584, -0.10462262, -0.22264048, -0.043842267, -0.12742832, -0.21778631, 0.0715335, -0.17921853, -0.3856251, -0.16335362, 0.21045755, -0.5026229, 0.14405337, 0.23096423), target2);
	target2 = MulAdd(g2, MF4x4(-0.32437655, 0.07860345, -0.0021187086, 0.123870686, -0.16616751, 0.11004699, 0.04754715, -0.0075211064, -0.08026408, 0.04284957, -0.018143758, 0.032623176, 0.06614686, -0.035856936, 0.13667971, -0.15696613), target2);
	target2 = MulAdd(h2, MF4x4(0.11260625, 0.03274457, -0.033769324, -0.11558525, -0.35377702, 0.0019119612, 0.24906515, -0.06853208, 0.0009843144, -0.0050376705, 0.063123666, 0.009872904, 0.19592324, 0.0028321196, -0.114693984, 0.16404222), target2);
	target2 = MulAdd(i2, MF4x4(-0.03699667, 0.011842293, -0.12273219, 0.04081692, 0.008484447, -0.052331816, 0.07151068, 0.018538639, 0.077749036, 0.07189092, 0.22443593, -0.2436085, 0.023654116, -0.05127411, 0.27350748, 0.12180999), target2);
	target2 = MulAdd(na1, MF4x4(0.16090482, 0.059198547, 0.04856637, -0.19173436, 0.12747662, -0.079715036, -0.20203276, -0.13818277, -0.123076215, -0.07168488, 0.0644838, 0.03524764, 0.0005124138, -0.06789178, 0.048645556, -0.098922126), target2);
	target2 = MulAdd(nb1, MF4x4(0.29220074, 0.25197285, 0.09825887, 0.030363245, -0.033246458, -0.08370418, -0.12231589, -0.023000835, 0.082732, -0.16907515, -0.052518822, 0.07991363, 0.06222654, -0.06747275, -0.18931144, -0.42009747), target2);
	target2 = MulAdd(nc1, MF4x4(0.02667354, 0.03842717, -0.012755562, 0.061840586, 0.01060547, -0.29081437, 0.010907111, 0.07930905, 0.12273201, 0.017574295, 0.051024225, 0.019036688, 0.07671181, 0.049130872, -0.09734168, -0.070569195), target2);
	target2 = MulAdd(nd1, MF4x4(0.08517651, 0.0767222, -0.15657257, 0.18501835, -0.13749431, -0.2833894, 0.109219365, 0.033763003, 0.18988928, 0.13461404, -0.036578514, -0.13256857, -0.097819485, -0.17316358, -0.06512401, 0.1937444), target2);
	target2 = MulAdd(ne1, MF4x4(-0.32173568, -0.072075866, 0.13004705, -0.15507852, -0.23741087, -0.29364398, 0.10723945, -0.11976219, 0.20620506, 0.17970093, 0.24463713, -0.12555319, -0.021192182, -0.1374317, 0.5359718, 0.59974134), target2);
	target2 = MulAdd(nf1, MF4x4(-0.01101575, 0.040466793, -0.009630791, 0.13422947, -0.13290837, -0.24789505, -0.061713737, -0.07838521, 0.05379315, -0.14643523, -0.09155805, -0.049997047, 0.06696885, 0.20043123, -0.07542329, -0.08041673), target2);
	target2 = MulAdd(ng1, MF4x4(0.022160506, 0.01611432, -0.10189221, -0.022767285, -0.06682965, 0.047138248, 0.06860934, -0.012574086, 0.04010214, -0.041280016, -0.034621384, -0.018262599, 0.09731754, -0.059062295, 0.14786182, -0.15185094), target2);
	target2 = MulAdd(nh1, MF4x4(-0.052484483, 0.06899427, 0.18380043, -0.058414727, 0.07685985, -0.07206598, -0.101362616, -0.012002652, 0.008517392, 0.079471916, -0.30394664, 0.028600946, -0.03270232, -0.23564856, 0.045065008, -0.0034684737), target2);
	target2 = MulAdd(ni1, MF4x4(-0.049757, 0.07614825, 0.16394803, 0.027053174, 0.0451278, -0.09351286, -0.0042182617, 0.12332257, -0.025281021, -0.03843008, 0.12857373, -0.07611989, -0.0062898803, 0.022618141, -0.13122174, -0.03328411), target2);
	target2 = MulAdd(na2, MF4x4(0.12251631, 0.047008447, 0.027589995, -0.12207328, -0.1510795, 0.06724553, 0.17066906, 0.16992114, -0.0026905634, -0.035480864, 0.033738773, 0.018674552, 0.028614907, -0.019945908, -0.0156899, -0.09562145), target2);
	target2 = MulAdd(nb2, MF4x4(0.116588935, 0.14205505, 0.099545434, -0.045527786, -0.049273346, 0.20760757, 0.053965498, -0.12198069, -0.14654607, 0.041820496, 0.038068503, 0.24565905, 0.09786504, 0.18309233, 0.23802327, -0.085740186), target2);
	target2 = MulAdd(nc2, MF4x4(-0.1262052, -0.011846116, -0.058820397, -0.019373653, -0.09569547, -0.08265971, -0.05178388, -0.020502446, -0.17525336, -0.22874829, 0.0075891856, -0.189923, 0.09809122, 0.109637566, -0.0005973885, -0.06477921), target2);
	target2 = MulAdd(nd2, MF4x4(0.28209856, 0.11276813, 0.054377034, -0.00891202, -0.095922634, 0.071109876, -0.039932176, -0.047409832, -0.06504704, 0.11923986, 0.0013364811, -0.122095086, -0.20282102, -0.022717483, -0.115474045, 0.020858249), target2);
	target2 = MulAdd(ne2, MF4x4(-0.16130303, 0.072821185, -0.021358958, -0.11687897, -0.15543966, 0.05783285, 0.10317231, -0.12240756, 0.053357504, -0.090291016, -0.21943556, 0.46947235, 0.19072579, 0.017349033, -0.55443907, -0.10510661), target2);
	target2 = MulAdd(nf2, MF4x4(-0.4155687, 0.019206723, -0.20055711, 0.028732464, -0.1981807, 0.20637372, 0.03305817, -0.17949893, -0.21051097, 0.21483344, 0.0061496794, -0.48980987, -0.26750582, 0.09230394, -0.117223755, -0.07636286), target2);
	target2 = MulAdd(ng2, MF4x4(0.20611528, -0.00095511036, -0.21555157, -0.07065484, 0.06880411, 0.068082534, -0.10104979, 0.16050354, -0.07437897, -0.13145325, -0.017651044, 0.055096775, -0.05443345, -0.018634815, -0.011232755, -0.10835), target2);
	target2 = MulAdd(nh2, MF4x4(-0.2637829, 0.07681072, 0.015995527, 0.004554211, 0.07495561, 0.18873464, -0.14303622, 0.25786543, -0.14117226, -0.008715274, -0.17176823, -0.0006595096, -0.06566383, -0.19184378, -0.18945406, 0.20968987), target2);
	target2 = MulAdd(ni2, MF4x4(-0.03293623, 0.003399063, 0.08051177, -0.0072856937, -0.07375858, 0.075319655, -0.10791501, -0.002204552, -0.093564905, -0.122712255, -0.10658267, -0.015067637, -0.033247817, 0.09952069, -0.13724248, 0.068189256), target2);

	conv2d_5_tf[gxy] = target1;
	conv2d_5_tf1[gxy] = target2;
}

//!PASS 7
//!DESC Conv-4x3x3x16
//!IN conv2d_5_tf, conv2d_5_tf1
//!OUT conv2d_6_tf, conv2d_6_tf1
//!BLOCK_SIZE 8
//!NUM_THREADS 64

void Pass7(uint2 blockStart, uint3 threadId) {
	uint2 gxy = Rmp8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// [ a, d, g ]
	// [ b, e, h ]
	// [ c, f, i ]
	MF4 a1 = conv2d_5_tf.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	MF4 b1 = conv2d_5_tf.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	MF4 c1 = conv2d_5_tf.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	MF4 d1 = conv2d_5_tf.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	MF4 e1 = conv2d_5_tf.SampleLevel(sam, pos, 0);
	MF4 f1 = conv2d_5_tf.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	MF4 g1 = conv2d_5_tf.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	MF4 h1 = conv2d_5_tf.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	MF4 i1 = conv2d_5_tf.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	MF4 na1 = max(-a1, 0);
	MF4 nb1 = max(-b1, 0);
	MF4 nc1 = max(-c1, 0);
	MF4 nd1 = max(-d1, 0);
	MF4 ne1 = max(-e1, 0);
	MF4 nf1 = max(-f1, 0);
	MF4 ng1 = max(-g1, 0);
	MF4 nh1 = max(-h1, 0);
	MF4 ni1 = max(-i1, 0);

	a1 = max(a1, 0);
	b1 = max(b1, 0);
	c1 = max(c1, 0);
	d1 = max(d1, 0);
	e1 = max(e1, 0);
	f1 = max(f1, 0);
	g1 = max(g1, 0);
	h1 = max(h1, 0);
	i1 = max(i1, 0);

	MF4 a2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(-inputPt.x, -inputPt.y), 0);
	MF4 b2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	MF4 c2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	MF4 d2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	MF4 e2 = conv2d_5_tf1.SampleLevel(sam, pos, 0);
	MF4 f2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	MF4 g2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	MF4 h2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	MF4 i2 = conv2d_5_tf1.SampleLevel(sam, pos + float2(inputPt.x, inputPt.y), 0);

	MF4 na2 = max(-a2, 0);
	MF4 nb2 = max(-b2, 0);
	MF4 nc2 = max(-c2, 0);
	MF4 nd2 = max(-d2, 0);
	MF4 ne2 = max(-e2, 0);
	MF4 nf2 = max(-f2, 0);
	MF4 ng2 = max(-g2, 0);
	MF4 nh2 = max(-h2, 0);
	MF4 ni2 = max(-i2, 0);

	a2 = max(a2, 0);
	b2 = max(b2, 0);
	c2 = max(c2, 0);
	d2 = max(d2, 0);
	e2 = max(e2, 0);
	f2 = max(f2, 0);
	g2 = max(g2, 0);
	h2 = max(h2, 0);
	i2 = max(i2, 0);

	MF4 target1 = { -0.036374483, 0.029420665, 0.04437756, -0.04474691 };
	target1 = MulAdd(a1, MF4x4(0.0053346683, 0.010174534, -0.050979972, -0.06134544, -0.007238652, -0.012790015, 0.036398683, -0.09181499, 0.11328388, -0.14236617, -0.17519625, -0.34661606, 0.008069393, -0.028871074, -0.02980949, -0.0853359), target1);
	target1 = MulAdd(b1, MF4x4(-0.05187267, -0.09381704, 0.035209883, 0.29482442, -0.0018002815, -0.029504262, 0.2609028, -0.09480671, -0.0737553, -0.070559524, 0.081991084, 0.1513024, 0.048344653, -0.09336617, 0.0034569732, 0.10530542), target1);
	target1 = MulAdd(c1, MF4x4(-0.06749591, 0.0065624053, 0.013237342, 0.14225115, 0.27183163, -0.15656447, 0.031672053, 0.009592649, -0.0202286, 0.26220062, 0.19387855, -0.18505628, 0.040554795, 0.07295961, -0.13291295, -0.12600344), target1);
	target1 = MulAdd(d1, MF4x4(0.039192002, 0.0846215, -0.06593224, 0.28147796, 0.06301313, 0.26323164, -0.16742979, 0.22004774, -0.17470881, 0.060716614, 0.15430811, 0.18970133, 0.08858931, -0.027321626, -0.037833836, 0.07344837), target1);
	target1 = MulAdd(e1, MF4x4(0.0633813, 0.35046157, -0.101075254, 0.015974075, 0.19010352, -0.7135035, -0.24324696, -0.42072615, 0.06825536, -0.052808974, 0.28965715, -0.0015640302, -0.27062586, 0.4279925, 0.035623744, 0.46321228), target1);
	target1 = MulAdd(f1, MF4x4(0.02639867, 0.26469797, -0.09086266, 0.07440796, -0.192054, 0.1010368, -0.04398074, 0.056824226, -0.27057743, -0.20455118, 0.19338831, -0.21843775, 0.20736177, -0.26259273, -0.07667085, -0.19504389), target1);
	target1 = MulAdd(g1, MF4x4(-0.007056104, 0.04284205, 0.01933733, 0.07267832, 0.0012616975, -0.30140647, -0.019223223, -0.046687007, -0.037844718, -0.014929125, 0.022630794, 0.046716493, 0.057279173, -0.08055539, -0.027891241, 0.019557232), target1);
	target1 = MulAdd(h1, MF4x4(0.035518404, -0.10087327, 0.0011048123, -0.123707846, 0.37190285, 0.43751532, -0.022599256, -0.041709043, 0.11357196, -0.029839104, -0.056960747, -0.17228557, 0.08558022, 0.046361133, 0.021548864, 0.24297418), target1);
	target1 = MulAdd(i1, MF4x4(-0.043598346, -0.09812348, 0.056599542, -0.09833163, -0.07193007, 0.015760094, -0.053177495, -0.015448543, 0.035163186, -0.03889347, 0.121799015, 0.15738566, -0.115644835, 0.043310717, 0.060173217, -0.059635755), target1);
	target1 = MulAdd(a2, MF4x4(-0.111604795, 0.1678389, 0.049967546, 0.045353863, -0.013896185, 0.035128903, 0.040686198, -0.16442506, 0.1149577, -0.14343217, -0.08858, 0.02656137, 0.059526477, -0.13914491, 0.12757027, 0.034920372), target1);
	target1 = MulAdd(b2, MF4x4(0.15849945, 0.12067003, -0.1579611, 0.30790725, -0.041249942, 0.03948043, -0.12535375, -0.02566875, 0.3150059, 0.027081972, -0.026308673, -0.25326517, 0.016824603, -0.13551097, 0.1412756, 0.037750524), target1);
	target1 = MulAdd(c2, MF4x4(0.1562541, -0.041948073, -0.14951487, 0.119380556, -0.21773878, -0.019281754, 0.08185942, 0.09982689, 0.017187534, -0.18181366, -0.09270861, 0.08527679, 0.051988564, 0.08686172, -0.12665209, -0.07205808), target1);
	target1 = MulAdd(d2, MF4x4(0.08860466, -0.17931758, 0.10191625, -0.47623265, 0.1562338, -0.2960855, 0.013664795, 0.29452285, 0.1463958, 0.17562817, -0.41623253, -0.196999, -0.049113005, 0.0556021, 0.054452494, 0.14073615), target1);
	target1 = MulAdd(e2, MF4x4(-0.5345973, -0.069205046, 0.37001884, 0.6955835, 0.22635284, -0.09021557, -0.04693607, -0.4458824, 0.25049326, -0.06503396, 0.07584689, 0.5394811, 0.33387923, -0.010540017, 0.038980547, -0.13371105), target1);
	target1 = MulAdd(f2, MF4x4(-0.04414677, -0.22056313, 0.05580458, 0.11914465, 0.19864987, -0.1025625, -0.010050287, 0.15919746, -0.40589634, 0.4966349, -0.47632688, -0.022637444, 0.17247641, -0.51093113, 0.21157944, -0.2890017), target1);
	target1 = MulAdd(g2, MF4x4(-0.034673482, -0.0075900992, -0.061077584, -0.03859898, 0.32444152, -0.14619137, -0.1375446, -0.030322462, 0.029679669, 0.079344586, -0.03862258, -0.05766807, 0.104488336, 0.006179548, -0.1168102, 0.069729604), target1);
	target1 = MulAdd(h2, MF4x4(0.08504003, 0.042162962, -0.17509954, -0.06258286, -0.45796555, -0.061748773, 0.25438437, -0.02988987, -0.06897794, 0.105180845, -0.08879189, -0.120605074, -0.1478659, -0.13201937, -0.01755498, 0.020606143), target1);
	target1 = MulAdd(i2, MF4x4(0.08932581, 0.1453785, -0.12802933, 0.049442187, 0.045360584, 0.16079827, -0.14142223, 0.10168491, 0.20244479, -0.17981426, 0.19759466, 0.05217847, 0.04889828, 0.06941533, -0.111836776, -0.08046399), target1);
	target1 = MulAdd(na1, MF4x4(-0.011953735, 0.11362504, -0.122588776, -0.10408559, 0.051712614, -0.05161036, -0.068698496, -0.015663281, -0.06346889, 0.06561636, 0.03783044, 0.02756004, -0.036310352, -0.16962235, -0.062494226, 0.0069608325), target1);
	target1 = MulAdd(nb1, MF4x4(-0.16857432, -0.17322211, 0.15971284, 0.19980437, -0.007965961, -0.015480705, 0.036090557, 0.07414387, -0.2941106, -0.24430539, 0.01070864, 0.22401866, -0.34321144, 0.09537491, -0.08020218, 0.45404655), target1);
	target1 = MulAdd(nc1, MF4x4(-0.021609096, -0.11348408, -0.01450652, -0.063170746, 0.06990935, -0.035983834, -0.038010992, -0.10578655, 0.29232737, 0.048835874, 0.054028947, -0.12924139, -0.03058583, 0.028469706, 0.09563202, 0.085674495), target1);
	target1 = MulAdd(nd1, MF4x4(-0.01894022, 0.037628658, -0.102314636, -0.28041583, 0.07495663, -0.058895253, 0.16422969, -0.07163792, 0.039416216, -0.13800906, -0.039811566, -0.10612402, -0.047593113, -0.28491783, 0.41632858, 0.15253194), target1);
	target1 = MulAdd(ne1, MF4x4(0.26240867, -0.05335849, -0.014135048, 0.055749495, -0.020126658, 0.2952794, -0.015241771, 0.36143306, 0.43075684, 0.1921996, -0.4329065, 0.5114495, 0.7326109, -0.054901246, -0.076693356, -0.26104695), target1);
	target1 = MulAdd(nf1, MF4x4(0.14548428, 0.14578429, 0.17193514, -0.07973242, 0.011952286, -0.047767498, 0.025101405, 0.0016503566, -0.26948047, -0.16503395, -0.061791085, 0.030557185, 0.15400517, -0.054951698, -0.14611247, 0.3550633), target1);
	target1 = MulAdd(ng1, MF4x4(-0.05926111, -0.083442695, 0.046579204, -0.017723244, 0.12846185, 0.018434443, -0.17914511, -0.077696435, 0.060048338, 0.02956987, -0.11914462, 0.057770032, -0.054673657, -0.005353606, -0.39014184, 0.08306877), target1);
	target1 = MulAdd(nh1, MF4x4(0.07357362, 0.23051825, -0.22640751, 0.080715515, -0.14467078, 0.009734264, 0.054320686, 0.24534328, -0.16038458, 0.06575425, 0.058553413, 0.17755087, 0.08184439, 0.17078212, 0.148369, -0.09309279), target1);
	target1 = MulAdd(ni1, MF4x4(-0.11160211, -0.07590204, -0.01676188, -0.062253337, 0.016433533, 0.0146132, -0.040350936, 0.06749202, -0.031521842, 0.1441664, -0.09916073, 0.050578352, -0.06560962, -0.31174552, 0.056873083, -0.077912), target1);
	target1 = MulAdd(na2, MF4x4(0.09344025, 0.075936995, -0.1627903, -0.04781558, -0.01878236, 0.045879602, -0.11507387, -0.025356822, -0.09113391, 0.07263937, 0.08232447, 0.08727616, -0.024921807, 0.051639438, 0.006532631, -0.018751068), target1);
	target1 = MulAdd(nb2, MF4x4(0.022455849, -0.12924309, 0.26318657, -0.32464805, -0.09627585, 0.04496843, -0.09630052, -0.025761643, -0.090804085, 0.24410398, -0.03162944, -0.1961483, 0.14065808, -0.064709485, -0.0040163463, 0.05445074), target1);
	target1 = MulAdd(nc2, MF4x4(-0.020935195, -0.1028065, 0.0012804621, 0.02302866, -0.00924972, -0.0041193594, 0.0060590385, -0.003394384, -0.23241943, -0.023235107, 0.08077456, 0.15720141, 0.06568382, -0.09971436, 0.09056065, 0.04271102), target1);
	target1 = MulAdd(nd2, MF4x4(-0.20997737, -0.12892777, 0.4658528, 0.13622813, -0.2867294, -0.09359254, 0.18821026, 0.25550604, -0.18562363, 0.080713026, 0.13463654, 0.045504905, -0.013133853, -0.1316404, 0.08379897, -0.00047142128), target1);
	target1 = MulAdd(ne2, MF4x4(0.3276134, 0.21952826, -0.80777377, -0.69810224, 0.34190908, -0.09293263, 0.33313555, -0.27255502, -0.24287084, -0.07741488, 0.06090265, -0.10161252, -0.37684909, 0.4678029, 0.13506591, 0.42470258), target1);
	target1 = MulAdd(nf2, MF4x4(0.080790855, -0.09707547, -0.05506975, 0.027011644, -0.1434346, 0.01363872, 0.12616752, 0.16789167, 0.1656414, -0.11586835, 0.059612263, -0.074029386, -0.19813071, 0.46032718, -0.03935981, 0.0067143585), target1);
	target1 = MulAdd(ng2, MF4x4(0.10322512, 0.0822636, -0.16766444, 0.041008063, -0.027768405, 0.23103505, 0.06737122, 0.15258405, 0.04557388, -0.18179403, 0.12489025, -0.09759324, -0.05925805, 0.04869987, 0.07329833, -0.09738542), target1);
	target1 = MulAdd(nh2, MF4x4(-0.10823879, -0.403376, 0.3264802, -0.16503738, -0.057512645, -0.20902547, -0.14862378, -0.3192005, -0.046263676, 0.12744917, -0.019174274, -0.02318789, -0.085088454, -0.05723332, 0.0039772973, 0.07991316), target1);
	target1 = MulAdd(ni2, MF4x4(0.10313916, 0.04410904, 0.03286652, 0.059946325, 0.019948404, 0.070217304, -0.017572487, 0.20332281, 0.06776308, 0.029285522, -0.14116238, -0.05864782, -0.18382367, -0.06568212, 0.11855615, 0.101256005), target1);
	
	MF4 target2 = { -0.010602045, 0.053976092, 0.008913503, 0.0011945717 };
	target2 = MulAdd(a1, MF4x4(0.059325468, 0.10884231, 0.018158086, 0.031802185, 0.10368743, -0.06776637, 0.048326045, -0.06312353, -0.0025675546, 0.09309577, -0.025533969, 0.029684044, 0.017237723, 0.062099144, 0.047039766, 0.050348036), target2);
	target2 = MulAdd(b1, MF4x4(-0.04767078, -0.06409279, 0.112965874, 0.04621161, -0.28172916, -0.13897015, -0.022806352, 0.26966885, 0.02019569, -0.10707113, -0.43058416, -0.14103983, -0.13225646, -0.020053176, -0.17319782, -0.009653082), target2);
	target2 = MulAdd(c1, MF4x4(0.0031349238, -0.060933832, 0.107986666, -0.019791966, -0.23946726, -0.18045186, 0.18286318, -0.05431065, 0.11742379, -0.019123906, 0.33327517, 0.07455424, -0.035427105, 0.18659347, -0.050884776, 0.019193258), target2);
	target2 = MulAdd(d1, MF4x4(-0.22954239, 0.011265787, -0.026520751, -0.12629737, -0.07009803, 0.44925988, -0.15938939, 0.11956771, 0.11535644, -0.1302371, 0.1235775, 0.16483483, 0.022965495, 0.110546246, 0.00064579415, -0.12753843), target2);
	target2 = MulAdd(e1, MF4x4(0.047553673, 0.16213869, 0.7510964, 0.21228868, 0.40994287, 0.61919236, 0.3982374, -0.016163021, 0.3291035, 0.1134356, 0.12384387, -0.31114763, 0.21338554, -0.04721641, 0.122114286, 0.2717476), target2);
	target2 = MulAdd(f1, MF4x4(-0.06529201, -0.08936482, 0.031857736, -0.02372691, 0.0416097, 0.28484538, -0.38181338, -0.05129518, 0.40150553, -0.01970737, 0.1043854, 0.11986372, -0.2267319, 0.0014845231, -0.035269983, 0.11712099), target2);
	target2 = MulAdd(g1, MF4x4(0.079867415, -0.09982735, 0.10313241, 0.055490237, -0.42685422, -0.3431141, -0.06037366, 0.17539841, -0.010511819, -0.09743252, 0.050748866, 0.11064108, -0.09785722, -0.10230299, -0.04106169, -0.016831731), target2);
	target2 = MulAdd(h1, MF4x4(-0.06847075, -0.026447225, -0.123430386, 0.063637204, -0.37617612, -0.09615662, -0.26226708, -0.008175561, -0.08101131, 0.11093525, -0.13149206, -0.06363292, -0.0482858, -0.2771799, 0.10528571, 0.119109035), target2);
	target2 = MulAdd(i1, MF4x4(0.09151277, 0.029019276, 0.041349206, -0.011239478, 0.035083957, 0.05281079, -0.0742173, -0.018509442, -0.17175299, -0.4226507, -0.118186444, -0.0771296, 0.107038856, 0.0819975, 0.12445646, 0.07091557), target2);
	target2 = MulAdd(a2, MF4x4(0.1275357, -0.097659886, -0.0114354445, 0.023900568, -0.02511702, 0.005830569, -0.010882143, -0.04046068, -0.08638482, 0.08664022, -0.15654318, 0.03333846, -0.12521335, -0.11987078, 0.028556254, -0.020760164), target2);
	target2 = MulAdd(b2, MF4x4(-0.38474286, -0.15288061, 0.04925842, 0.050009686, 0.23555282, 0.054784663, -0.0971203, 0.017791113, -0.35539824, -0.08806168, 0.08992579, 0.22714761, -0.047685623, -0.17510797, 0.1137738, -0.069451034), target2);
	target2 = MulAdd(c2, MF4x4(-0.16623408, -0.08202571, -0.03291826, 0.0016267949, 0.20682698, 0.08788948, 0.10241089, 0.019209227, -0.14802241, 0.091788374, -0.238735, -0.06633396, 0.02360112, 0.1521805, -0.022510838, -0.08931379), target2);
	target2 = MulAdd(d2, MF4x4(0.034280665, -0.12431295, 0.092791, 0.15279225, -0.43373865, 0.20077267, -0.15919733, -0.27969292, -0.26948065, 0.19652127, -0.27456176, 0.04137772, 0.006545539, 0.0031402514, 0.03849979, -0.10978278), target2);
	target2 = MulAdd(e2, MF4x4(0.62025917, -0.32462567, 0.2817292, -0.18380783, -0.3338593, -0.49056754, 0.32645953, 0.4146035, 0.3773462, 0.54346967, -0.032203436, -0.14506778, -0.30044907, 0.40134314, 0.24155408, 0.24397472), target2);
	target2 = MulAdd(f2, MF4x4(0.089335114, -0.05529855, -0.18364899, -0.153323, -0.18347202, -0.060125064, -0.29216367, -0.2717291, 0.10592963, 0.38889876, 0.25363386, 0.33723134, -0.103703365, 0.14922962, -0.21206948, -0.20289616), target2);
	target2 = MulAdd(g2, MF4x4(-0.035760924, 0.18820894, -0.12723185, -0.018780319, 0.124459654, 0.28909087, -0.2763883, -0.45110545, 0.098143585, 0.16052029, -0.055098705, -0.14840914, -0.0019514654, 0.07090622, -0.055036955, -0.0035953245), target2);
	target2 = MulAdd(h2, MF4x4(-0.124669634, 0.23131305, -0.05750295, -0.056296032, 0.35691026, 0.2640789, 0.49912274, 0.26795143, -0.26460487, -0.026896512, -0.07179325, 0.17373477, -0.13186656, 0.0021319336, -0.016407885, 0.3014283), target2);
	target2 = MulAdd(i2, MF4x4(-0.09491939, 0.11503968, -0.14077829, -0.043197304, -0.061866064, -0.1574549, 0.0054375776, 0.066160634, -0.17686372, -0.26767558, -0.038844116, 0.122724466, -0.05043839, 0.063884266, 0.0064002997, -0.13583377), target2);
	target2 = MulAdd(na1, MF4x4(0.031301867, -0.02947819, -0.0016769855, 0.12952408, -0.025022922, 0.065425046, -0.072289295, -0.071249105, 0.14579567, -0.09058119, 0.12663712, 0.1515388, 0.44767743, 0.02971349, 0.015892735, -0.08058422), target2);
	target2 = MulAdd(nb1, MF4x4(-0.2868111, -0.10812653, -0.29182926, -0.38444322, -0.0875354, -0.07220258, 0.05978065, 0.093328245, 0.058548283, -0.013913258, -0.20954674, -0.16400063, 0.3185215, 0.068897314, 0.15869021, 0.022877626), target2);
	target2 = MulAdd(nc1, MF4x4(0.116845705, -0.12729645, 0.056697316, -0.21263942, -0.07000074, 0.073977455, -0.09006404, -0.029770354, -0.20823102, -0.20088868, 0.15658094, 0.24306639, -0.0453592, -0.16011035, 0.08521533, -0.032264974), target2);
	target2 = MulAdd(nd1, MF4x4(0.1114789, -0.1083731, 0.10465276, -0.08903837, -0.06455987, 0.040030345, -0.07937248, -0.20654759, -0.26873547, -0.19390975, -0.039021965, -0.025602374, -0.5575801, -0.08876011, -0.19116728, -0.2401055), target2);
	target2 = MulAdd(ne1, MF4x4(0.37626424, -0.0912155, -0.6153361, -0.71465075, 0.018208932, -0.14997734, 0.23627761, 0.20832567, 0.07427123, -0.37869486, -0.26574427, 0.187582, -0.37201726, 0.17809474, -0.02568795, 0.23900814), target2);
	target2 = MulAdd(nf1, MF4x4(-0.085337594, -0.50634587, 0.30636734, -0.2760558, 0.01893911, -0.08425695, -0.023656169, 0.021421626, 0.16813251, -0.039550815, 0.21165498, -0.027628547, -0.123874225, 0.013802332, -0.2732087, -0.09419671), target2);
	target2 = MulAdd(ng1, MF4x4(-0.07190724, -0.019237598, 0.020249542, 0.07541295, -0.03817686, 0.09266451, -0.12214172, -0.01344174, 0.03281797, 0.057655178, -0.059896503, 0.014948791, -0.13952477, 0.18810949, -0.19016883, 0.06842416), target2);
	target2 = MulAdd(nh1, MF4x4(-0.13111524, 0.14539744, -0.10212538, -0.2169032, 0.13810973, -0.12576458, 0.124372825, 0.04992259, 0.21758182, -0.22160134, 0.24321079, 0.017698256, 0.39995426, 0.074034885, 0.120019354, -0.15522505), target2);
	target2 = MulAdd(ni1, MF4x4(0.023914235, 0.1424257, 0.010302871, 0.15150794, -0.040021677, 0.015862139, 0.14459212, 0.08632827, 0.04257336, 0.055059638, 0.0030461506, 0.011985334, -0.049230937, 0.07851301, -0.05119983, -0.111701734), target2);
	target2 = MulAdd(na2, MF4x4(0.04485158, 0.116597414, 0.00014909732, -0.012128512, 0.15801767, 0.18273115, -0.033926453, 0.05170487, -0.040683754, -0.18606974, 0.08324687, 0.069539666, 0.07098698, -0.014132968, 0.029499048, -0.07263477), target2);
	target2 = MulAdd(nb2, MF4x4(0.04309544, 0.089722805, -0.018306322, 0.29061043, 0.15191254, 0.15917647, 0.0073858183, 0.039199475, 0.42514518, -0.053955313, 0.10820046, -0.09134685, -0.3087313, -0.16339037, -0.05226669, 0.044995327), target2);
	target2 = MulAdd(nc2, MF4x4(0.008636428, 0.029086163, -0.09151674, -0.36466715, -0.0128008155, 0.018820466, -0.02700147, -0.0064047636, 0.28287655, 0.02709404, -0.05233492, -0.08967187, -0.042183813, -0.13990502, -0.005085154, -0.028511493), target2);
	target2 = MulAdd(nd2, MF4x4(0.00022532263, -0.09108507, 0.0089569865, 0.052016005, -0.19314727, -0.355347, 0.08082937, 0.2134498, 0.21036889, -0.10165983, 0.20334485, 0.14575538, 0.017676214, -0.13149881, -0.018741794, -0.019599862), target2);
	target2 = MulAdd(ne2, MF4x4(-0.20513605, 0.47578803, -0.18631598, 0.2535432, -0.049522053, -0.37224755, 0.11227206, -0.37000927, 0.19969453, -0.47287735, -0.07506754, -0.0957071, 0.82927394, -0.54057014, 0.5800732, 0.08937558), target2);
	target2 = MulAdd(nf2, MF4x4(-0.022189412, 0.14622113, -0.4772564, -0.31178755, 0.10667427, -0.07335338, 0.06144331, 0.00056827103, -0.08263861, -0.009126272, -0.22802618, -0.20760304, 0.12688845, -0.061324466, 0.33361357, 0.38350767), target2);
	target2 = MulAdd(ng2, MF4x4(0.021188622, 0.1151918, -0.10654739, -0.03341855, 0.24870358, -0.06689332, 0.11881217, -0.0045951125, -0.039464932, -0.030190004, 0.014174111, -0.025356272, 0.07469406, -0.0059695644, 0.008267219, -0.0991054), target2);
	target2 = MulAdd(nh2, MF4x4(-0.009981438, -0.36484948, 0.04801225, 0.22368562, -0.055985868, 0.229039, -0.10823553, 0.1477355, -0.0091677625, 0.06279847, 0.034393013, 0.031901076, 0.28783056, 0.086422645, 0.20860936, 0.054018307), target2);
	target2 = MulAdd(ni2, MF4x4(-0.08720452, -0.07756267, 0.018853918, -0.014108689, -0.019337144, 0.021249043, -0.05633926, -0.109904505, -0.088990815, 0.16876367, -0.13149975, -0.054357648, 0.08588134, -0.10262266, 0.12052009, 0.05154292), target2);

	conv2d_6_tf[gxy] = target1;
	conv2d_6_tf1[gxy] = target2;
}

//!PASS 8
//!DESC Conv-4x1x1x112, Depth-to-Space
//!IN INPUT, conv2d_tf, conv2d_tf1, conv2d_1_tf, conv2d_1_tf1, conv2d_2_tf, conv2d_2_tf1, conv2d_3_tf, conv2d_3_tf1, conv2d_4_tf, conv2d_4_tf1, conv2d_5_tf, conv2d_5_tf1, conv2d_6_tf, conv2d_6_tf1
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64

void Pass8(uint2 blockStart, uint3 threadId) {
	uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;
	
	const uint2 outputSize = GetOutputSize();
	if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	float2 pos = ((gxy >> 1) + 0.5f) * inputPt;

	MF4 g0 = conv2d_tf.SampleLevel(sam, pos, 0);
	MF4 g1 = conv2d_tf1.SampleLevel(sam, pos, 0);
	MF4 g2 = conv2d_1_tf.SampleLevel(sam, pos, 0);
	MF4 g3 = conv2d_1_tf1.SampleLevel(sam, pos, 0);
	MF4 g4 = conv2d_2_tf.SampleLevel(sam, pos, 0);
	MF4 g5 = conv2d_2_tf1.SampleLevel(sam, pos, 0);
	MF4 g6 = conv2d_3_tf.SampleLevel(sam, pos, 0);
	MF4 g7 = conv2d_3_tf1.SampleLevel(sam, pos, 0);
	MF4 g8 = conv2d_4_tf.SampleLevel(sam, pos, 0);
	MF4 g9 = conv2d_4_tf1.SampleLevel(sam, pos, 0);
	MF4 g10 = conv2d_5_tf.SampleLevel(sam, pos, 0);
	MF4 g11 = conv2d_5_tf1.SampleLevel(sam, pos, 0);
	MF4 g12 = conv2d_6_tf.SampleLevel(sam, pos, 0);
	MF4 g13 = conv2d_6_tf1.SampleLevel(sam, pos, 0);

	MF4 ng0 = max(-g0, 0);
	MF4 ng1 = max(-g1, 0);
	MF4 ng2 = max(-g2, 0);
	MF4 ng3 = max(-g3, 0);
	MF4 ng4 = max(-g4, 0);
	MF4 ng5 = max(-g5, 0);
	MF4 ng6 = max(-g6, 0);
	MF4 ng7 = max(-g7, 0);
	MF4 ng8 = max(-g8, 0);
	MF4 ng9 = max(-g9, 0);
	MF4 ng10 = max(-g10, 0);
	MF4 ng11 = max(-g11, 0);
	MF4 ng12 = max(-g12, 0);
	MF4 ng13 = max(-g13, 0);

	g0 = max(g0, 0);
	g1 = max(g1, 0);
	g2 = max(g2, 0);
	g3 = max(g3, 0);
	g4 = max(g4, 0);
	g5 = max(g5, 0);
	g6 = max(g6, 0);
	g7 = max(g7, 0);
	g8 = max(g8, 0);
	g9 = max(g9, 0);
	g10 = max(g10, 0);
	g11 = max(g11, 0);
	g12 = max(g12, 0);
	g13 = max(g13, 0);

	MF4 target1 = { -0.11952045, -0.10779418, -0.0626279, -0.042614873 };
	target1 = MulAdd(g0, MF4x4(-0.105475314, -0.07022547, -0.16326137, -0.12503424, -0.004623021, -0.0143323885, 0.042996034, 0.03422294, -0.38310882, -0.4431925, -0.28772846, -0.3213578, -0.018014904, 0.02429277, -0.07177951, -0.04458822), target1);
	target1 = MulAdd(g1, MF4x4(-0.0973233, -0.032439478, -0.08420249, -0.054693196, 0.012960555, 0.06929602, 0.004247494, 0.061315402, -0.09607745, -0.16862066, 0.01537482, -0.038459156, 0.019662246, 0.059920583, -0.1071646, -0.06478967), target1);
	target1 = MulAdd(ng0, MF4x4(0.15711947, 0.0754732, 0.17891979, 0.098270796, 0.14122486, 0.14893766, 0.12408279, 0.14845194, 0.16199848, 0.14090912, 0.13496809, 0.1119815, 0.03974558, -0.057513904, 0.09213575, -0.0012252429), target1);
	target1 = MulAdd(ng1, MF4x4(-0.011343602, -0.02488338, 0.07799659, 0.06503721, 0.06380687, 0.048929837, -0.05555838, -0.050519127, 0.14673206, 0.18085165, 0.07261422, 0.09738158, 0.07395791, 0.005573146, -0.05454926, -0.13565786), target1);
	target1 = MulAdd(g2, MF4x4(-0.08591514, -0.05664865, 0.23980616, 0.24876402, 0.19052829, 0.011938714, 0.21487322, 0.058656186, 0.036630988, 0.14918756, 0.013127693, 0.13092093, -0.37889576, -0.4068804, -0.27258882, -0.30605716), target1);
	target1 = MulAdd(g3, MF4x4(-0.25149816, -0.21979512, -0.24949454, -0.20483162, -0.10972783, -0.17315808, -0.08562763, -0.16086778, 0.044681527, 0.050807394, -0.019424994, -0.022418005, 0.10039492, -0.013666552, -0.22373566, -0.34493732), target1);
	target1 = MulAdd(ng2, MF4x4(0.1419155, 0.081392206, -0.18103191, -0.2122926, -0.1445937, -0.015969204, -0.12368782, -0.0044421684, -0.09534078, -0.14815839, -0.1052107, -0.16341865, 0.3050403, 0.34488317, 0.16171226, 0.18700944), target1);
	target1 = MulAdd(ng3, MF4x4(0.12444696, 0.08712589, 0.06266247, 0.031022022, 0.17707655, 0.24904409, 0.20961654, 0.2610619, -0.099262595, -0.06900819, -0.034567446, -0.020191457, -0.1468561, -0.04683958, 0.14910224, 0.244686), target1);
	target1 = MulAdd(g4, MF4x4(-0.002428158, -0.012889509, 0.0006541127, -0.0058380975, 0.096147396, 0.07791617, 0.119144954, 0.11699654, -0.024602454, -0.07894611, -0.00021709128, -0.03979557, 0.0028512406, -0.015790012, 0.0082511455, 0.029357092), target1);
	target1 = MulAdd(g5, MF4x4(-0.01410329, -0.004162405, -0.09005045, -0.07753674, 0.004509965, -0.024188736, 0.13799691, 0.10589621, -0.023018798, 0.0064198375, -0.103344224, -0.07463909, -0.060048997, -0.071094714, -0.13042289, -0.14482167), target1);
	target1 = MulAdd(ng4, MF4x4(-0.009015246, 0.01581748, -0.035448726, -0.012348933, -0.101627484, -0.05530413, -0.14063041, -0.121775225, 0.074719116, 0.033839386, 0.045573987, -0.006698053, 0.0015141299, 0.003634417, 0.017102007, 0.0074890694), target1);
	target1 = MulAdd(ng5, MF4x4(0.0042357175, 0.018735386, 0.058959343, 0.057424515, -0.021633089, -0.037194982, -0.14109972, -0.1506368, 0.004357002, -0.006871023, 0.05337361, 0.039684236, 0.087463334, 0.07772685, 0.12278512, 0.1224218), target1);
	target1 = MulAdd(g6, MF4x4(0.018359886, 0.046934873, -0.008225237, 0.020650858, -0.03961538, -0.014779162, -0.04161338, -0.00953579, 0.0017313146, 0.0068857935, -0.0024282748, 0.0047545764, 0.02635904, 0.027336216, 0.02701322, 0.029939381), target1);
	target1 = MulAdd(g7, MF4x4(-0.00067966996, 0.024480496, -0.015218739, -0.010472019, -0.03994461, -0.052318517, -0.04450191, -0.043226667, -0.03166469, -0.03799331, 0.015428865, -0.018422252, 0.00040845043, 0.03558268, -0.0099401595, -0.00054432114), target1);
	target1 = MulAdd(ng6, MF4x4(-0.0032104475, 0.019604867, -0.02486679, 0.002134673, 0.014368818, -0.0013395248, 0.017318068, 0.0021403218, -0.02198377, 0.010297547, -0.041619625, -0.02740482, -0.067249276, -0.03040953, -0.021304253, -0.009557115), target1);
	target1 = MulAdd(ng7, MF4x4(-0.019099236, -0.037010793, 0.013720462, 0.023708181, 0.016356282, -0.00028589502, -0.010570909, -0.009186907, 0.03493662, 0.055599142, -0.017043956, 0.004204044, -0.013573257, -0.013537684, 0.008151195, 0.0074913655), target1);
	target1 = MulAdd(g8, MF4x4(0.009309031, -0.0014795153, 0.025114728, -0.0066442797, -0.012085473, -0.0030560147, 0.002144206, 0.0009732741, 0.022301642, -0.0091133695, 0.0011837826, -0.020275833, -0.021349607, -0.011693419, -0.018912962, -0.022418445), target1);
	target1 = MulAdd(g9, MF4x4(-0.0045772395, 0.031085191, 0.01215795, 0.023887333, 0.023408212, 0.0005998807, 0.011254428, -0.004634461, 0.016601006, 0.046663348, 0.031117432, 0.04910873, -0.113230005, -0.035702843, -0.058746565, -0.053893737), target1);
	target1 = MulAdd(ng8, MF4x4(-0.020218112, 0.056803435, -0.0037077996, 0.05123925, -0.016713811, -0.05551032, -0.005916611, -0.037839632, -0.007671626, -0.009099201, -0.0010055836, 0.003332688, 0.020744357, 0.01957675, 0.057906736, 0.041446246), target1);
	target1 = MulAdd(ng9, MF4x4(0.022438819, 0.04616756, 0.035925094, 0.0639705, 0.0009332198, 0.020964272, -0.010805394, 0.031757344, 0.051255573, 0.032838948, 0.00055445684, -0.03195623, 0.04753827, 0.016436901, 0.04788274, 0.022093765), target1);
	target1 = MulAdd(g10, MF4x4(0.03479086, 0.035946105, 0.04343359, 0.04015664, 0.06081792, 0.061758887, 0.10128842, 0.007471392, -0.027261607, -0.01290544, -0.029938918, -0.050834358, -0.015550162, 0.0072828676, -0.04580556, -0.029642029), target1);
	target1 = MulAdd(g11, MF4x4(0.011150116, 0.029789668, -0.00354488, 0.045047592, -0.018265083, -0.020843878, 0.015457328, 0.0053232997, 0.0791804, -0.028661052, 0.079342775, -0.039631505, 0.14613943, 0.08323415, 0.049641483, 0.047863442), target1);
	target1 = MulAdd(ng10, MF4x4(-0.103034586, -0.107580125, 0.00044325445, 0.007830247, -0.017059505, 0.010152936, -0.02845979, -0.01841766, -0.10722863, -0.025262646, -0.07402096, -0.025055556, 0.0013303137, 0.12574737, -0.0161103, 0.06077798), target1);
	target1 = MulAdd(ng11, MF4x4(-0.0420636, -0.062703885, -0.06476972, -0.10516001, 0.018120673, 0.024305122, -0.013997766, 0.015815413, -0.06317691, -0.03968166, -0.054052643, -0.016300509, -0.08255892, -0.01612941, -0.04194852, -0.012637189), target1);
	target1 = MulAdd(g12, MF4x4(0.042659573, -0.10762496, -0.077143244, 0.12583935, -0.022020226, -0.0042312425, -0.016734738, 0.027007964, -0.06609771, -0.056038737, -0.0058528963, 0.035508137, -0.019722374, -0.055094264, 0.010977759, -0.009833099), target1);
	target1 = MulAdd(g13, MF4x4(0.063830875, -0.019885639, 0.055574782, 0.039456647, 0.01576898, -0.1389799, 0.063411795, -0.11600623, -0.013968303, -0.03318867, -0.06806915, -0.09373464, -0.022723546, -0.03329239, 0.014282872, 0.027576538), target1);
	target1 = MulAdd(ng12, MF4x4(-0.018100513, 0.06204485, 0.010761461, -0.045085587, 0.009286288, 0.02310671, 0.10633246, -0.090849996, 0.13112675, -0.01639808, 0.0022725316, -0.076779045, 0.11831251, 0.1460306, -0.10849466, -0.07749171), target1);
	target1 = MulAdd(ng13, MF4x4(-0.15850247, 0.118011266, -0.10121594, -0.007109052, 0.071873754, 0.06954878, 0.0377852, 0.044174008, -0.062925555, -0.01758927, 0.1416964, 0.17206357, -0.035632525, -0.04652215, 0.061932907, 0.034339), target1);

	MF4 target2 = { 0.05825913, 0.051491056, 0.038389463, 0.03321517 };
	target2 = MulAdd(g0, MF4x4(-0.009000901, -0.018048609, 0.013095594, 0.002321373, 0.0004716619, 0.00504148, -0.016826658, -0.014922383, 0.15059204, 0.16593806, 0.115392484, 0.12520894, 0.05049829, 0.060210057, 0.086421266, 0.07242362), target2);
	target2 = MulAdd(g1, MF4x4(0.06268658, 0.030466434, 0.07876877, 0.04129863, 0.04142328, 0.009963961, 0.051785357, 0.012811113, 0.1295883, 0.139931, 0.07733839, 0.08014211, 0.07156476, 0.0342396, 0.051614303, 0.043559864), target2);
	target2 = MulAdd(ng0, MF4x4(0.00041542648, 0.016051646, -0.011512418, 0.013076814, 0.03734479, 0.02791584, 0.012426691, 0.022044811, -0.034128398, -0.027107332, -0.021998279, -0.012139807, -0.033177473, -0.016310865, -0.078221664, -0.041203145), target2);
	target2 = MulAdd(ng1, MF4x4(-0.008398536, -0.010332053, -0.050231732, -0.039691273, -0.042082537, -0.030281143, -0.014039778, -0.0020190612, -0.11956351, -0.13638765, -0.09794402, -0.10228069, -0.08344795, -0.07944541, -0.004189214, -0.028206991), target2);
	target2 = MulAdd(g2, MF4x4(0.0002908945, -0.00831185, -0.06870294, -0.083311856, -0.024992501, 0.0038247898, -0.049389005, -0.020098582, -0.0135326125, -0.040408995, -0.012083491, -0.042174604, 0.16112538, 0.13720983, 0.13937058, 0.10870099), target2);
	target2 = MulAdd(g3, MF4x4(0.078961425, 0.082619205, 0.06910667, 0.06579004, -0.0077012256, -0.00038692637, 0.00015553503, -0.012561662, 0.00053048285, -0.01461681, 0.02600344, 0.024862211, -0.06958201, -0.048246548, 0.058762506, 0.036662634), target2);
	target2 = MulAdd(ng2, MF4x4(-0.023527982, -0.0028001352, 0.047800142, 0.09616409, 0.049143843, 0.030836122, 0.057244994, 0.025672587, 0.027565151, 0.039868724, 0.045296676, 0.04623187, -0.124759234, -0.14106254, -0.06337279, -0.076839216), target2);
	target2 = MulAdd(ng3, MF4x4(-0.0911771, -0.064436875, -0.05308137, -0.022082496, -0.0040269364, 0.0014464161, -0.0029555515, 0.016098293, -0.026650434, -0.014081368, -0.06747348, -0.05481826, 0.097423114, 0.08620988, -0.01607732, -0.015440677), target2);
	target2 = MulAdd(g4, MF4x4(-0.014001735, -0.015001655, -0.013250577, -0.009930805, 0.04885879, 0.07092224, 0.025783395, 0.03792237, -0.04332465, -0.06244993, -0.046748653, -0.07132349, -0.0053951666, -0.016514057, 0.023807624, 0.044013456), target2);
	target2 = MulAdd(g5, MF4x4(-0.009097996, -0.016898679, -0.05043909, -0.063178614, -0.016210863, -0.02157998, -0.02654472, -0.042961173, 0.012103852, 0.019015301, 0.02492281, 0.03389976, 0.015276502, 0.009577683, 0.04132527, -0.00070621347), target2);
	target2 = MulAdd(ng4, MF4x4(-0.0057500796, 0.00728164, -0.003422421, 0.0038979584, -0.03127353, -0.019125199, -0.012988815, -0.031890683, 0.09352588, 0.019210607, 0.09824038, 0.016637104, 0.010692808, 0.022393884, 0.008312123, 0.014120716), target2);
	target2 = MulAdd(ng5, MF4x4(0.013895599, 0.023097904, 0.009370535, 0.014099512, 0.0124661345, -0.015076684, 0.03287286, 0.005912471, -0.03944815, -0.020340785, -0.06822037, -0.059383288, 0.03634978, 0.007832939, -0.007142306, -0.0061968984), target2);
	target2 = MulAdd(g6, MF4x4(0.033002097, 0.0516016, -0.021056438, 0.005715988, -0.02223013, -0.007962324, -0.024417123, -0.0014790733, 0.002167189, 0.00043749413, -0.007284963, -0.0027283782, 0.026238248, 0.01756047, 0.008969755, 0.014201024), target2);
	target2 = MulAdd(g7, MF4x4(0.011576685, 0.02087598, 0.0026766327, -0.0041780816, -0.05277701, -0.05412841, -0.05958835, -0.050426245, -0.00662945, -0.021645393, 0.03423904, -0.0064581474, -0.030403355, 0.018391011, -0.026089542, -0.0051510665), target2);
	target2 = MulAdd(ng6, MF4x4(-0.046202097, -0.0066081425, -0.03698851, 0.0034165455, -0.011859245, -0.020945566, -0.0028196946, -0.010053285, -0.011400397, 0.030595876, -0.018915813, 0.006780077, -0.060040582, -0.009586898, -0.004477886, 0.011279908), target2);
	target2 = MulAdd(ng7, MF4x4(-0.028692413, -0.032535568, 0.0017473884, 0.02207169, 0.0192618, 0.008956797, -0.0033381556, 0.006326402, 0.0169569, 0.041449737, -0.02611751, 0.0006410355, 0.006233776, 0.0008467914, 0.011884985, 0.009222136), target2);
	target2 = MulAdd(g8, MF4x4(0.017076496, -0.0045380928, 0.03444613, -0.009804047, -0.004829834, -0.004889702, 0.0057807956, 0.0015014127, 0.03458368, -0.0035773432, -0.007769679, -0.032449644, -0.021396799, -0.017612215, -0.012764735, -0.025224172), target2);
	target2 = MulAdd(g9, MF4x4(-0.011824532, 0.02335273, 0.00764845, 0.019215155, 0.022186808, 0.0066053392, 0.0071694753, -0.0036117272, 0.032144524, 0.05025988, 0.03982363, 0.052400436, -0.10555114, -0.03809396, -0.05334183, -0.05524487), target2);
	target2 = MulAdd(ng8, MF4x4(-0.024599254, 0.058805298, 0.00069874676, 0.06263439, -0.018460508, -0.053566024, -0.0022889362, -0.035818785, -0.0135854995, -0.015712813, 0.0012080368, 0.005957637, 0.009450094, 0.03186346, 0.059969924, 0.057706963), target2);
	target2 = MulAdd(ng9, MF4x4(0.026783831, 0.05475865, 0.027565574, 0.06032707, -0.0015639095, 0.024381682, -0.010199071, 0.037544634, 0.039889377, 0.03318851, -0.016529158, -0.0343188, 0.045666486, 0.021665907, 0.042189375, 0.02444145), target2);
	target2 = MulAdd(g10, MF4x4(0.03791853, 0.043746054, 0.056224477, 0.05098111, 0.075256795, 0.074653305, 0.116220035, 0.01853866, -0.04133627, -0.009134169, -0.0420953, -0.05210053, -0.021748418, 0.004422131, -0.05422814, -0.035721727), target2);
	target2 = MulAdd(g11, MF4x4(0.013814317, 0.03149986, -0.004971173, 0.04782029, -0.01693027, -0.017984565, 0.019328078, 0.008521426, 0.0845641, -0.027555496, 0.08150416, -0.04623306, 0.16494128, 0.09300831, 0.074097835, 0.0627848), target2);
	target2 = MulAdd(ng10, MF4x4(-0.10307174, -0.112654425, -0.005589254, -0.0062108496, -0.012491583, 0.011512013, -0.03142282, -0.023683488, -0.099848576, -0.031290524, -0.07236223, -0.037460987, 0.008760208, 0.1473594, -0.009216949, 0.07251379), target2);
	target2 = MulAdd(ng11, MF4x4(-0.04915367, -0.07121096, -0.06572174, -0.10967046, 0.019548079, 0.023992533, -0.019842865, 0.012366459, -0.07207817, -0.04237792, -0.054463565, -0.015374731, -0.092071235, -0.020860313, -0.054475963, -0.02303954), target2);
	target2 = MulAdd(g12, MF4x4(0.04160816, -0.118427366, -0.08661791, 0.12787233, -0.01990174, 0.0012960634, -0.016121056, 0.031429946, -0.06830865, -0.057132352, -0.0022302791, 0.03845933, -0.026981276, -0.063532256, 0.011805961, -0.009616678), target2);
	target2 = MulAdd(g13, MF4x4(0.07094465, -0.022284096, 0.060676746, 0.042626668, 0.011207256, -0.14960343, 0.05866539, -0.12742221, -0.021092903, -0.039463162, -0.07879986, -0.10232898, -0.026127055, -0.038111385, 0.019167708, 0.032637425), target2);
	target2 = MulAdd(ng12, MF4x4(-0.014270794, 0.07157703, 0.013714203, -0.047801998, 0.0060221693, 0.022788104, 0.10630103, -0.09606649, 0.12690987, -0.017079826, -0.0077072172, -0.082584485, 0.13256006, 0.16012523, -0.10966099, -0.07927409), target2);
	target2 = MulAdd(ng13, MF4x4(-0.17171615, 0.12114435, -0.10746857, -0.0074188868, 0.07854815, 0.07759372, 0.04310874, 0.051465522, -0.05963588, -0.014506484, 0.15522978, 0.18746643, -0.03845241, -0.0489534, 0.05837339, 0.032978524), target2);

	MF4 target3 = { 0.06614842, 0.045779686, 0.032838725, 0.017085627 };
	target3 = MulAdd(g0, MF4x4(0.2006987, 0.17832398, 0.26342955, 0.23500517, -0.012297829, -0.009008417, -0.039950736, -0.039973143, 0.34800097, 0.32196492, 0.30505183, 0.29214156, -0.21410535, -0.21166423, -0.16437815, -0.19172792), target3);
	target3 = MulAdd(g1, MF4x4(-0.008804151, -0.07085123, 0.013577994, -0.05192605, -0.08981402, -0.14702585, -0.09145975, -0.14835288, -0.15882517, -0.14785844, -0.2381482, -0.22867912, 0.010898514, 0.031957507, 0.040597558, 0.078252345), target3);
	target3 = MulAdd(ng0, MF4x4(-0.21658613, -0.1803885, -0.25954962, -0.20839214, -0.09597461, -0.09222647, -0.03909875, -0.03456191, -0.19723509, -0.16976605, -0.2041716, -0.1751425, 0.22901416, 0.24922715, 0.1800083, 0.23346905), target3);
	target3 = MulAdd(ng1, MF4x4(0.110020064, 0.103858806, 0.052446555, 0.061105963, 0.032901537, 0.07140097, 0.11518793, 0.13860466, 0.13930707, 0.12712196, 0.19071707, 0.18399614, -0.08036458, -0.07349171, 0.021504594, 0.0024937368), target3);
	target3 = MulAdd(g2, MF4x4(0.059065036, 0.00698257, -0.099622436, -0.15676253, -0.10942482, -0.04869624, -0.13654554, -0.07341863, -0.014169851, -0.06390744, 0.016093008, -0.04540248, 0.29041344, 0.24451919, 0.26292154, 0.22136512), target3);
	target3 = MulAdd(g3, MF4x4(0.107946776, 0.097849295, 0.10266876, 0.09360328, 0.08931344, 0.08896482, 0.046495322, 0.044040844, -0.020361643, -0.030911373, 0.026598722, 0.019815676, -0.072677925, -0.042410247, 0.14127749, 0.13434973), target3);
	target3 = MulAdd(ng2, MF4x4(-0.08809133, -0.03476601, 0.06420393, 0.14691353, 0.09296839, 0.06162562, 0.10992992, 0.0615685, 0.0168736, 0.06520281, 0.020010693, 0.046929173, -0.2219495, -0.21249783, -0.14622301, -0.14599061), target3);
	target3 = MulAdd(ng3, MF4x4(-0.13251069, -0.08977477, -0.08930347, -0.045490693, -0.10980218, -0.09510885, -0.07299872, -0.064053826, 0.011365247, 0.014091111, -0.054976214, -0.056936122, 0.10148144, 0.07451642, -0.08138598, -0.10161657), target3);
	target3 = MulAdd(g4, MF4x4(-0.0075518745, -0.005738622, -0.007577811, -0.00032088626, 0.032614008, 0.04858922, 0.00054855715, 0.011565026, -0.022675224, -0.034442738, -0.03580643, -0.05069376, -0.0020376542, -0.01505518, 0.019388825, 0.03746554), target3);
	target3 = MulAdd(g5, MF4x4(-0.011413172, -0.016877454, -0.048923567, -0.055012885, -0.007709447, -0.016109072, -0.047132388, -0.07146396, 0.002604099, 0.00014681708, 0.03429465, 0.043265607, 0.029014807, 0.03337814, 0.07582056, 0.041660666), target3);
	target3 = MulAdd(ng4, MF4x4(-0.020768544, -0.014378527, -0.01999972, -0.01385916, -0.012264676, -0.009959511, 0.0119015165, -0.016787319, 0.07266734, -0.0029914333, 0.08549183, 0.004367342, 0.008236065, 0.020370748, 0.0043428927, 0.007301017), target3);
	target3 = MulAdd(ng5, MF4x4(0.011654731, 0.025318999, -0.0029306612, 0.007426217, -0.00010868774, -0.020845588, 0.041991003, 0.024147986, -0.030741083, -0.012328637, -0.06617428, -0.06103115, 0.010491518, -0.013338451, -0.04666634, -0.046481613), target3);
	target3 = MulAdd(g6, MF4x4(0.0268538, 0.043785956, -0.01799385, 0.008743307, -0.013197458, -0.015049436, -0.017189734, -0.0047999253, -0.00059730676, -0.0008936153, -0.016006093, -0.0073406673, 0.014875853, 0.011491735, 9.819833e-05, 0.0073417514), target3);
	target3 = MulAdd(g7, MF4x4(0.019930955, 0.027112626, 0.01307941, 0.005268897, -0.060213763, -0.050415818, -0.069006495, -0.051405095, 0.0036414433, -0.008606397, 0.037427194, -0.0018103109, -0.037434716, 0.010187546, -0.026227329, -0.0033639795), target3);
	target3 = MulAdd(ng6, MF4x4(-0.03634798, 0.0007093891, -0.026819145, 0.009025687, -0.01750318, -0.020098133, -0.0063864207, -0.006606755, 0.0008565766, 0.028647956, -0.0024974607, 0.015250743, -0.048884176, -0.004310685, -0.0010757383, 0.00974984), target3);
	target3 = MulAdd(ng7, MF4x4(-0.031253602, -0.031743724, -0.009083253, 0.0145388115, 0.02048611, 0.0058071036, -0.0038228377, 0.00049654936, 0.0059105973, 0.03437731, -0.025785418, 0.004187733, 0.009980489, -4.08268e-05, 0.009384428, 0.0019492983), target3);
	target3 = MulAdd(g8, MF4x4(0.012587245, -0.0032654977, 0.029739188, -0.009440694, -0.0018237908, -0.0080032, 0.010499013, 0.0012466761, 0.03461923, -0.0060207327, -0.008771263, -0.034545746, -0.015023473, -0.008901684, -0.011490296, -0.01976464), target3);
	target3 = MulAdd(g9, MF4x4(-0.009444331, 0.020809013, 0.009985801, 0.020350901, 0.013234775, 0.004382635, -0.0007761826, -0.005247294, 0.034115106, 0.05190378, 0.039124765, 0.050993033, -0.0898732, -0.030428126, -0.044204578, -0.052484997), target3);
	target3 = MulAdd(ng8, MF4x4(-0.020434443, 0.053520404, 0.0007571144, 0.05895061, -0.018991265, -0.043982152, -0.004035192, -0.025452444, -0.012197152, -0.013770753, 0.0012919102, 0.003996682, 0.0056104586, 0.025686186, 0.05128293, 0.05105745), target3);
	target3 = MulAdd(ng9, MF4x4(0.030201769, 0.052521482, 0.029641917, 0.05559941, 0.0018870027, 0.020112835, -0.0043867202, 0.035877172, 0.02961142, 0.02163634, -0.027972858, -0.040669747, 0.03393723, 0.013455979, 0.03313782, 0.01968004), target3);
	target3 = MulAdd(g10, MF4x4(0.034817442, 0.04045217, 0.054816365, 0.05092461, 0.06600807, 0.062576495, 0.09923777, 0.006663677, -0.039958935, -0.010009866, -0.041522443, -0.04959681, -0.020962957, 0.003845031, -0.04910384, -0.03233655), target3);
	target3 = MulAdd(g11, MF4x4(0.015433112, 0.028965838, -0.0055138534, 0.042267464, -0.012690953, -0.009424165, 0.017896382, 0.01186686, 0.07231686, -0.038834292, 0.07033086, -0.052548733, 0.15721905, 0.09334892, 0.07676042, 0.06701375), target3);
	target3 = MulAdd(ng10, MF4x4(-0.09797534, -0.11201098, -0.0037222446, -0.008105951, -0.01152357, 0.012165641, -0.029051905, -0.021293389, -0.09600697, -0.028819272, -0.069084235, -0.035421908, 0.0054322914, 0.14168966, -0.0200274, 0.06505187), target3);
	target3 = MulAdd(ng11, MF4x4(-0.05034882, -0.06622497, -0.062471002, -0.100628324, 0.018115615, 0.019867867, -0.018836644, 0.007562053, -0.06317378, -0.034458403, -0.047243826, -0.009989589, -0.08270121, -0.018645251, -0.05160367, -0.023690399), target3);
	target3 = MulAdd(g12, MF4x4(0.03897899, -0.10862529, -0.081805214, 0.1202324, -0.021866674, -0.00041882638, -0.018235246, 0.027227063, -0.0656312, -0.053432237, -0.0029235696, 0.03549672, -0.022848906, -0.057047505, 0.013400545, -0.0072439364), target3);
	target3 = MulAdd(g13, MF4x4(0.06879516, -0.018637763, 0.058062725, 0.041045032, 0.011702424, -0.13693465, 0.05674195, -0.11679955, -0.022940686, -0.03856922, -0.07531371, -0.09692582, -0.019870926, -0.032572743, 0.026138868, 0.037639033), target3);
	target3 = MulAdd(ng12, MF4x4(-0.015270301, 0.06478719, 0.011016518, -0.04533957, 0.00688319, 0.024815995, 0.10159924, -0.08467507, 0.11939162, -0.01939453, -0.0058689644, -0.077881604, 0.118726775, 0.14489114, -0.10831982, -0.07972515), target3);
	target3 = MulAdd(ng13, MF4x4(-0.16734359, 0.10685446, -0.102714166, -0.010225307, 0.07306756, 0.07014447, 0.040464073, 0.04688462, -0.05489714, -0.01525318, 0.14690581, 0.17514132, -0.03250648, -0.03688211, 0.05047889, 0.03078089), target3);

	float2 outputPt = GetOutputPt();

	pos -= 0.5f * outputPt;
	OUTPUT[gxy] = MF4(MF3(target1.x, target2.x, target3.x) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);

	++gxy.x;
	pos.x += outputPt.x;
	OUTPUT[gxy] = MF4(MF3(target1.y, target2.y, target3.y) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
	
	++gxy.y;
	pos.y += outputPt.y;
	OUTPUT[gxy] = MF4(MF3(target1.w, target2.w, target3.w) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);

	--gxy.x;
	pos.x -= outputPt.x;
	OUTPUT[gxy] = MF4(MF3(target1.z, target2.z, target3.z) + INPUT.SampleLevel(sam1, pos, 0).rgb, 1);
}
